mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 15:27:53 +10:00
wip major changes
This commit is contained in:
@@ -1,16 +1,13 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__order__ = 100
|
||||
import abx
|
||||
|
||||
@abx.hookimpl
|
||||
|
||||
def register_admin(admin_site):
|
||||
"""Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
|
||||
from core.admin import register_admin
|
||||
register_admin(admin_site)
|
||||
from core.admin import register_admin as do_register
|
||||
do_register(admin_site)
|
||||
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def get_CONFIG():
|
||||
from archivebox.config.common import (
|
||||
SHELL_CONFIG,
|
||||
@@ -28,4 +25,3 @@ def get_CONFIG():
|
||||
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
|
||||
'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
|
||||
}
|
||||
|
||||
|
||||
@@ -9,10 +9,7 @@ from core.admin_snapshots import SnapshotAdmin
|
||||
from core.admin_archiveresults import ArchiveResultAdmin
|
||||
from core.admin_users import UserAdmin
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def register_admin(admin_site):
|
||||
admin_site.register(get_user_model(), UserAdmin)
|
||||
admin_site.register(ArchiveResult, ArchiveResultAdmin)
|
||||
|
||||
@@ -11,8 +11,6 @@ from django.utils import timezone
|
||||
|
||||
from huey_monitor.admin import TaskModel
|
||||
|
||||
import abx
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.misc.paginators import AccelleratedPaginator
|
||||
@@ -43,7 +41,6 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
ordering = ('end_ts',)
|
||||
show_change_link = True
|
||||
# # classes = ['collapse']
|
||||
# # list_display_links = ['abid']
|
||||
|
||||
def get_parent_object_from_request(self, request):
|
||||
resolved = resolve(request.path_info)
|
||||
@@ -80,7 +77,7 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
formset.form.base_fields['start_ts'].initial = timezone.now()
|
||||
formset.form.base_fields['end_ts'].initial = timezone.now()
|
||||
formset.form.base_fields['cmd_version'].initial = '-'
|
||||
formset.form.base_fields['pwd'].initial = str(snapshot.link_dir)
|
||||
formset.form.base_fields['pwd'].initial = str(snapshot.output_dir)
|
||||
formset.form.base_fields['created_by'].initial = request.user
|
||||
formset.form.base_fields['cmd'].initial = '["-"]'
|
||||
formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
|
||||
@@ -193,6 +190,5 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def register_admin(admin_site):
|
||||
admin_site.register(ArchiveResult, ArchiveResultAdmin)
|
||||
|
||||
@@ -36,7 +36,7 @@ def register_admin_site():
|
||||
admin.site = archivebox_admin
|
||||
sites.site = archivebox_admin
|
||||
|
||||
# register all plugins admin classes
|
||||
archivebox.pm.hook.register_admin(admin_site=archivebox_admin)
|
||||
# Plugin admin registration is now handled by individual app admins
|
||||
# No longer using archivebox.pm.hook.register_admin()
|
||||
|
||||
return archivebox_admin
|
||||
|
||||
@@ -19,11 +19,9 @@ from archivebox.misc.util import htmldecode, urldecode
|
||||
from archivebox.misc.paginators import AccelleratedPaginator
|
||||
from archivebox.misc.logging_util import printable_filesize
|
||||
from archivebox.search.admin import SearchResultsAdminMixin
|
||||
from archivebox.index.html import snapshot_icons
|
||||
from archivebox.extractors import archive_links
|
||||
|
||||
from archivebox.base_models.admin import BaseModelAdmin
|
||||
from archivebox.workers.tasks import bg_archive_links, bg_add
|
||||
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
|
||||
|
||||
from core.models import Tag
|
||||
from core.admin_tags import TagInline
|
||||
@@ -53,13 +51,13 @@ class SnapshotActionForm(ActionForm):
|
||||
# )
|
||||
|
||||
|
||||
class SnapshotAdmin(SearchResultsAdminMixin, BaseModelAdmin):
|
||||
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
|
||||
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
|
||||
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'link_dir')
|
||||
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'link_dir', 'available_config_options')
|
||||
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
|
||||
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
|
||||
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', *readonly_fields)
|
||||
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', 'available_config_options', *readonly_fields[:-1])
|
||||
ordering = ['-created_at']
|
||||
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
|
||||
inlines = [TagInline, ArchiveResultInline]
|
||||
@@ -196,14 +194,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, BaseModelAdmin):
|
||||
)
|
||||
def files(self, obj):
|
||||
# return '-'
|
||||
return snapshot_icons(obj)
|
||||
return obj.icons()
|
||||
|
||||
|
||||
@admin.display(
|
||||
# ordering='archiveresult_count'
|
||||
)
|
||||
def size(self, obj):
|
||||
archive_size = os.access(Path(obj.link_dir) / 'index.html', os.F_OK) and obj.archive_size
|
||||
archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size
|
||||
if archive_size:
|
||||
size_txt = printable_filesize(archive_size)
|
||||
if archive_size > 52428800:
|
||||
@@ -261,30 +259,27 @@ class SnapshotAdmin(SearchResultsAdminMixin, BaseModelAdmin):
|
||||
description="ℹ️ Get Title"
|
||||
)
|
||||
def update_titles(self, request, queryset):
|
||||
links = [snapshot.as_link() for snapshot in queryset]
|
||||
if len(links) < 3:
|
||||
# run syncronously if there are only 1 or 2 links
|
||||
archive_links(links, overwrite=True, methods=('title','favicon'), out_dir=DATA_DIR)
|
||||
messages.success(request, f"Title and favicon have been fetched and saved for {len(links)} URLs.")
|
||||
else:
|
||||
# otherwise run in a background worker
|
||||
result = bg_archive_links((links,), kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
|
||||
messages.success(
|
||||
request,
|
||||
mark_safe(f"Title and favicon are updating in the background for {len(links)} URLs. {result_url(result)}"),
|
||||
)
|
||||
from core.models import Snapshot
|
||||
count = queryset.count()
|
||||
|
||||
# Queue snapshots for archiving via the state machine system
|
||||
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
|
||||
messages.success(
|
||||
request,
|
||||
mark_safe(f"Title and favicon are updating in the background for {count} URLs. {result_url(result)}"),
|
||||
)
|
||||
|
||||
@admin.action(
|
||||
description="⬇️ Get Missing"
|
||||
)
|
||||
def update_snapshots(self, request, queryset):
|
||||
links = [snapshot.as_link() for snapshot in queryset]
|
||||
count = queryset.count()
|
||||
|
||||
result = bg_archive_links((links,), kwargs={"overwrite": False, "out_dir": DATA_DIR})
|
||||
result = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
|
||||
|
||||
messages.success(
|
||||
request,
|
||||
mark_safe(f"Re-trying any previously failed methods for {len(links)} URLs in the background. {result_url(result)}"),
|
||||
mark_safe(f"Re-trying any previously failed methods for {count} URLs in the background. {result_url(result)}"),
|
||||
)
|
||||
|
||||
|
||||
@@ -307,13 +302,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, BaseModelAdmin):
|
||||
description="🔄 Redo"
|
||||
)
|
||||
def overwrite_snapshots(self, request, queryset):
|
||||
links = [snapshot.as_link() for snapshot in queryset]
|
||||
count = queryset.count()
|
||||
|
||||
result = bg_archive_links((links,), kwargs={"overwrite": True, "out_dir": DATA_DIR})
|
||||
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
|
||||
|
||||
messages.success(
|
||||
request,
|
||||
mark_safe(f"Clearing all previous results and re-downloading {len(links)} URLs in the background. {result_url(result)}"),
|
||||
mark_safe(f"Clearing all previous results and re-downloading {count} URLs in the background. {result_url(result)}"),
|
||||
)
|
||||
|
||||
@admin.action(
|
||||
|
||||
@@ -3,8 +3,6 @@ __package__ = 'archivebox.core'
|
||||
from django.contrib import admin
|
||||
from django.utils.html import format_html, mark_safe
|
||||
|
||||
import abx
|
||||
|
||||
from archivebox.misc.paginators import AccelleratedPaginator
|
||||
from archivebox.base_models.admin import BaseModelAdmin
|
||||
|
||||
@@ -150,7 +148,7 @@ class TagAdmin(BaseModelAdmin):
|
||||
|
||||
|
||||
# @admin.register(SnapshotTag, site=archivebox_admin)
|
||||
# class SnapshotTagAdmin(ABIDModelAdmin):
|
||||
# class SnapshotTagAdmin(BaseModelAdmin):
|
||||
# list_display = ('id', 'snapshot', 'tag')
|
||||
# sort_fields = ('id', 'snapshot', 'tag')
|
||||
# search_fields = ('id', 'snapshot_id', 'tag_id')
|
||||
@@ -159,7 +157,6 @@ class TagAdmin(BaseModelAdmin):
|
||||
# ordering = ['-id']
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def register_admin(admin_site):
|
||||
admin_site.register(Tag, TagAdmin)
|
||||
|
||||
|
||||
@@ -5,8 +5,6 @@ from django.contrib.auth.admin import UserAdmin
|
||||
from django.utils.html import format_html, mark_safe
|
||||
from django.contrib.auth import get_user_model
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
class CustomUserAdmin(UserAdmin):
|
||||
sort_fields = ['id', 'email', 'username', 'is_superuser', 'last_login', 'date_joined']
|
||||
@@ -86,6 +84,5 @@ class CustomUserAdmin(UserAdmin):
|
||||
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def register_admin(admin_site):
|
||||
admin_site.register(get_user_model(), CustomUserAdmin)
|
||||
|
||||
@@ -2,17 +2,12 @@ __package__ = 'archivebox.core'
|
||||
|
||||
from django.apps import AppConfig
|
||||
|
||||
import archivebox
|
||||
|
||||
|
||||
class CoreConfig(AppConfig):
|
||||
name = 'core'
|
||||
|
||||
def ready(self):
|
||||
"""Register the archivebox.core.admin_site as the main django admin site"""
|
||||
from django.conf import settings
|
||||
archivebox.pm.hook.ready(settings=settings)
|
||||
|
||||
from core.admin_site import register_admin_site
|
||||
register_admin_site()
|
||||
|
||||
|
||||
@@ -3,37 +3,34 @@ __package__ = 'archivebox.core'
|
||||
from django import forms
|
||||
|
||||
from archivebox.misc.util import URL_REGEX
|
||||
from ..parsers import PARSERS
|
||||
from taggit.utils import edit_string_for_tags, parse_tags
|
||||
|
||||
PARSER_CHOICES = [
|
||||
(parser_key, parser[0])
|
||||
for parser_key, parser in PARSERS.items()
|
||||
]
|
||||
DEPTH_CHOICES = (
|
||||
('0', 'depth = 0 (archive just these URLs)'),
|
||||
('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
|
||||
)
|
||||
|
||||
from ..extractors import get_default_archive_methods
|
||||
from archivebox.hooks import get_extractors
|
||||
|
||||
ARCHIVE_METHODS = [
|
||||
(name, name)
|
||||
for name, _, _ in get_default_archive_methods()
|
||||
]
|
||||
def get_archive_methods():
|
||||
"""Get available archive methods from discovered hooks."""
|
||||
return [(name, name) for name in get_extractors()]
|
||||
|
||||
|
||||
class AddLinkForm(forms.Form):
|
||||
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
|
||||
parser = forms.ChoiceField(label="URLs format", choices=[('auto', 'Auto-detect parser'), *PARSER_CHOICES], initial='auto')
|
||||
tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
|
||||
depth = forms.ChoiceField(label="Archive depth", choices=DEPTH_CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
|
||||
archive_methods = forms.MultipleChoiceField(
|
||||
label="Archive methods (select at least 1, otherwise all will be used by default)",
|
||||
required=False,
|
||||
widget=forms.SelectMultiple,
|
||||
choices=ARCHIVE_METHODS,
|
||||
choices=[], # populated dynamically in __init__
|
||||
)
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.fields['archive_methods'].choices = get_archive_methods()
|
||||
# TODO: hook these up to the view and put them
|
||||
# in a collapsible UI section labeled "Advanced"
|
||||
#
|
||||
|
||||
@@ -1,18 +1,14 @@
|
||||
# Generated by Django 3.0.8 on 2020-11-04 12:25
|
||||
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
from config import CONFIG
|
||||
from index.json import to_json
|
||||
|
||||
DATA_DIR = Path(os.getcwd()).resolve() # archivebox user data dir
|
||||
ARCHIVE_DIR = DATA_DIR / 'archive' # archivebox snapshot data dir
|
||||
|
||||
|
||||
try:
|
||||
JSONField = models.JSONField
|
||||
except AttributeError:
|
||||
@@ -21,12 +17,14 @@ except AttributeError:
|
||||
|
||||
|
||||
def forwards_func(apps, schema_editor):
|
||||
from core.models import EXTRACTORS
|
||||
|
||||
Snapshot = apps.get_model("core", "Snapshot")
|
||||
ArchiveResult = apps.get_model("core", "ArchiveResult")
|
||||
|
||||
snapshots = Snapshot.objects.all()
|
||||
for snapshot in snapshots:
|
||||
out_dir = ARCHIVE_DIR / snapshot.timestamp
|
||||
out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
|
||||
|
||||
try:
|
||||
with open(out_dir / "index.json", "r") as f:
|
||||
@@ -61,7 +59,7 @@ def forwards_func(apps, schema_editor):
|
||||
|
||||
def verify_json_index_integrity(snapshot):
|
||||
results = snapshot.archiveresult_set.all()
|
||||
out_dir = ARCHIVE_DIR / snapshot.timestamp
|
||||
out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
|
||||
with open(out_dir / "index.json", "r") as f:
|
||||
index = json.load(f)
|
||||
|
||||
|
||||
@@ -1,58 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-05-13 10:56
|
||||
|
||||
import charidfield.fields
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0022_auto_20231023_2008'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterModelOptions(
|
||||
name='archiveresult',
|
||||
options={'verbose_name': 'Result'},
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='abid',
|
||||
field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='res_', unique=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='abid',
|
||||
field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='snp_', unique=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='uuid',
|
||||
field=models.UUIDField(blank=True, null=True, unique=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='abid',
|
||||
field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='tag_', unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='extractor',
|
||||
field=models.CharField(choices=(
|
||||
('htmltotext', 'htmltotext'),
|
||||
('git', 'git'),
|
||||
('singlefile', 'singlefile'),
|
||||
('media', 'media'),
|
||||
('archive_org', 'archive_org'),
|
||||
('readability', 'readability'),
|
||||
('mercury', 'mercury'),
|
||||
('favicon', 'favicon'),
|
||||
('pdf', 'pdf'),
|
||||
('headers', 'headers'),
|
||||
('screenshot', 'screenshot'),
|
||||
('dom', 'dom'),
|
||||
('title', 'title'),
|
||||
('wget', 'wget'),
|
||||
), max_length=32),
|
||||
),
|
||||
]
|
||||
466
archivebox/core/migrations/0023_new_schema.py
Normal file
466
archivebox/core/migrations/0023_new_schema.py
Normal file
@@ -0,0 +1,466 @@
|
||||
# Generated by Django 5.0.6 on 2024-12-25
|
||||
# Transforms schema from 0022 to new simplified schema (ABID system removed)
|
||||
|
||||
from uuid import uuid4
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
|
||||
|
||||
def get_or_create_system_user_pk(apps, schema_editor):
|
||||
"""Get or create system user for migrations."""
|
||||
User = apps.get_model('auth', 'User')
|
||||
user, _ = User.objects.get_or_create(
|
||||
username='system',
|
||||
defaults={'is_active': False, 'password': '!'}
|
||||
)
|
||||
return user.pk
|
||||
|
||||
|
||||
def populate_created_by_snapshot(apps, schema_editor):
|
||||
"""Populate created_by for existing snapshots."""
|
||||
User = apps.get_model('auth', 'User')
|
||||
Snapshot = apps.get_model('core', 'Snapshot')
|
||||
|
||||
system_user, _ = User.objects.get_or_create(
|
||||
username='system',
|
||||
defaults={'is_active': False, 'password': '!'}
|
||||
)
|
||||
|
||||
Snapshot.objects.filter(created_by__isnull=True).update(created_by=system_user)
|
||||
|
||||
|
||||
def populate_created_by_archiveresult(apps, schema_editor):
|
||||
"""Populate created_by for existing archive results."""
|
||||
User = apps.get_model('auth', 'User')
|
||||
ArchiveResult = apps.get_model('core', 'ArchiveResult')
|
||||
|
||||
system_user, _ = User.objects.get_or_create(
|
||||
username='system',
|
||||
defaults={'is_active': False, 'password': '!'}
|
||||
)
|
||||
|
||||
ArchiveResult.objects.filter(created_by__isnull=True).update(created_by=system_user)
|
||||
|
||||
|
||||
def populate_created_by_tag(apps, schema_editor):
|
||||
"""Populate created_by for existing tags."""
|
||||
User = apps.get_model('auth', 'User')
|
||||
Tag = apps.get_model('core', 'Tag')
|
||||
|
||||
system_user, _ = User.objects.get_or_create(
|
||||
username='system',
|
||||
defaults={'is_active': False, 'password': '!'}
|
||||
)
|
||||
|
||||
Tag.objects.filter(created_by__isnull=True).update(created_by=system_user)
|
||||
|
||||
|
||||
def generate_uuid_for_archiveresults(apps, schema_editor):
|
||||
"""Generate UUIDs for archive results that don't have them."""
|
||||
ArchiveResult = apps.get_model('core', 'ArchiveResult')
|
||||
for ar in ArchiveResult.objects.filter(uuid__isnull=True).iterator(chunk_size=500):
|
||||
ar.uuid = uuid4()
|
||||
ar.save(update_fields=['uuid'])
|
||||
|
||||
|
||||
def generate_uuid_for_tags(apps, schema_editor):
|
||||
"""Generate UUIDs for tags that don't have them."""
|
||||
Tag = apps.get_model('core', 'Tag')
|
||||
for tag in Tag.objects.filter(uuid__isnull=True).iterator(chunk_size=500):
|
||||
tag.uuid = uuid4()
|
||||
tag.save(update_fields=['uuid'])
|
||||
|
||||
|
||||
def copy_bookmarked_at_from_added(apps, schema_editor):
|
||||
"""Copy added timestamp to bookmarked_at."""
|
||||
Snapshot = apps.get_model('core', 'Snapshot')
|
||||
Snapshot.objects.filter(bookmarked_at__isnull=True).update(
|
||||
bookmarked_at=models.F('added')
|
||||
)
|
||||
|
||||
|
||||
def copy_created_at_from_added(apps, schema_editor):
|
||||
"""Copy added timestamp to created_at for snapshots."""
|
||||
Snapshot = apps.get_model('core', 'Snapshot')
|
||||
Snapshot.objects.filter(created_at__isnull=True).update(
|
||||
created_at=models.F('added')
|
||||
)
|
||||
|
||||
|
||||
def copy_created_at_from_start_ts(apps, schema_editor):
|
||||
"""Copy start_ts to created_at for archive results."""
|
||||
ArchiveResult = apps.get_model('core', 'ArchiveResult')
|
||||
ArchiveResult.objects.filter(created_at__isnull=True).update(
|
||||
created_at=models.F('start_ts')
|
||||
)
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
"""
|
||||
This migration transforms the schema from the main branch (0022) to the new
|
||||
simplified schema without the ABID system.
|
||||
|
||||
For dev branch users who had ABID migrations (0023-0074), this replaces them
|
||||
with a clean transformation.
|
||||
"""
|
||||
|
||||
replaces = [
|
||||
('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
|
||||
('core', '0024_auto_20240513_1143'),
|
||||
('core', '0025_alter_archiveresult_uuid'),
|
||||
('core', '0026_archiveresult_created_archiveresult_created_by_and_more'),
|
||||
('core', '0027_update_snapshot_ids'),
|
||||
('core', '0028_alter_archiveresult_uuid'),
|
||||
('core', '0029_alter_archiveresult_id'),
|
||||
('core', '0030_alter_archiveresult_uuid'),
|
||||
('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'),
|
||||
('core', '0032_alter_archiveresult_id'),
|
||||
('core', '0033_rename_id_archiveresult_old_id'),
|
||||
('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'),
|
||||
('core', '0035_remove_archiveresult_uuid_archiveresult_id'),
|
||||
('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'),
|
||||
('core', '0037_rename_id_snapshot_old_id'),
|
||||
('core', '0038_rename_uuid_snapshot_id'),
|
||||
('core', '0039_rename_snapshot_archiveresult_snapshot_old'),
|
||||
('core', '0040_archiveresult_snapshot'),
|
||||
('core', '0041_alter_archiveresult_snapshot_and_more'),
|
||||
('core', '0042_remove_archiveresult_snapshot_old'),
|
||||
('core', '0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
|
||||
('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'),
|
||||
('core', '0045_alter_snapshot_old_id'),
|
||||
('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
|
||||
('core', '0047_alter_snapshottag_unique_together_and_more'),
|
||||
('core', '0048_alter_archiveresult_snapshot_and_more'),
|
||||
('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'),
|
||||
('core', '0050_alter_snapshottag_snapshot_old'),
|
||||
('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'),
|
||||
('core', '0052_alter_snapshottag_unique_together_and_more'),
|
||||
('core', '0053_remove_snapshottag_snapshot_old'),
|
||||
('core', '0054_alter_snapshot_timestamp'),
|
||||
('core', '0055_alter_tag_slug'),
|
||||
('core', '0056_remove_tag_uuid'),
|
||||
('core', '0057_rename_id_tag_old_id'),
|
||||
('core', '0058_alter_tag_old_id'),
|
||||
('core', '0059_tag_id'),
|
||||
('core', '0060_alter_tag_id'),
|
||||
('core', '0061_rename_tag_snapshottag_old_tag_and_more'),
|
||||
('core', '0062_alter_snapshottag_old_tag'),
|
||||
('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'),
|
||||
('core', '0064_alter_snapshottag_unique_together_and_more'),
|
||||
('core', '0065_remove_snapshottag_old_tag'),
|
||||
('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'),
|
||||
('core', '0067_alter_snapshottag_tag'),
|
||||
('core', '0068_alter_archiveresult_options'),
|
||||
('core', '0069_alter_archiveresult_created_alter_snapshot_added_and_more'),
|
||||
('core', '0070_alter_archiveresult_created_by_alter_snapshot_added_and_more'),
|
||||
('core', '0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more'),
|
||||
('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
|
||||
('core', '0073_rename_created_archiveresult_created_at_and_more'),
|
||||
('core', '0074_alter_snapshot_downloaded_at'),
|
||||
]
|
||||
|
||||
dependencies = [
|
||||
('core', '0022_auto_20231023_2008'),
|
||||
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||
]
|
||||
|
||||
operations = [
|
||||
# === SNAPSHOT CHANGES ===
|
||||
|
||||
# Add new fields to Snapshot
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(
|
||||
default=None, null=True, blank=True,
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
related_name='snapshot_set',
|
||||
to=settings.AUTH_USER_MODEL,
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='modified_at',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='bookmarked_at',
|
||||
field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='downloaded_at',
|
||||
field=models.DateTimeField(default=None, null=True, blank=True, db_index=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='depth',
|
||||
field=models.PositiveSmallIntegerField(default=0, db_index=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='status',
|
||||
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], default='queued', max_length=15, db_index=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='retry_at',
|
||||
field=models.DateTimeField(default=django.utils.timezone.now, null=True, blank=True, db_index=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='config',
|
||||
field=models.JSONField(default=dict, blank=False),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='notes',
|
||||
field=models.TextField(blank=True, default=''),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='output_dir',
|
||||
field=models.CharField(max_length=256, default=None, null=True, blank=True),
|
||||
),
|
||||
|
||||
# Copy data from old fields to new
|
||||
migrations.RunPython(copy_bookmarked_at_from_added, migrations.RunPython.noop),
|
||||
migrations.RunPython(copy_created_at_from_added, migrations.RunPython.noop),
|
||||
migrations.RunPython(populate_created_by_snapshot, migrations.RunPython.noop),
|
||||
|
||||
# Make created_by non-nullable after population
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
related_name='snapshot_set',
|
||||
to=settings.AUTH_USER_MODEL,
|
||||
db_index=True,
|
||||
),
|
||||
),
|
||||
|
||||
# Update timestamp field constraints
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='timestamp',
|
||||
field=models.CharField(max_length=32, unique=True, db_index=True, editable=False),
|
||||
),
|
||||
|
||||
# Update title field size
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='title',
|
||||
field=models.CharField(max_length=512, null=True, blank=True, db_index=True),
|
||||
),
|
||||
|
||||
# Remove old 'added' and 'updated' fields
|
||||
migrations.RemoveField(model_name='snapshot', name='added'),
|
||||
migrations.RemoveField(model_name='snapshot', name='updated'),
|
||||
|
||||
# Remove old 'tags' CharField (now M2M via Tag model)
|
||||
migrations.RemoveField(model_name='snapshot', name='tags'),
|
||||
|
||||
# === TAG CHANGES ===
|
||||
|
||||
# Add uuid field to Tag temporarily for ID migration
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='uuid',
|
||||
field=models.UUIDField(default=uuid4, null=True, blank=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(
|
||||
default=None, null=True, blank=True,
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
related_name='tag_set',
|
||||
to=settings.AUTH_USER_MODEL,
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='modified_at',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
|
||||
# Populate UUIDs for tags
|
||||
migrations.RunPython(generate_uuid_for_tags, migrations.RunPython.noop),
|
||||
migrations.RunPython(populate_created_by_tag, migrations.RunPython.noop),
|
||||
|
||||
# Make created_by non-nullable
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
related_name='tag_set',
|
||||
to=settings.AUTH_USER_MODEL,
|
||||
),
|
||||
),
|
||||
|
||||
# Update slug field
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='slug',
|
||||
field=models.SlugField(unique=True, max_length=100, editable=False),
|
||||
),
|
||||
|
||||
# === ARCHIVERESULT CHANGES ===
|
||||
|
||||
# Add uuid field for new ID
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
field=models.UUIDField(default=uuid4, null=True, blank=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(
|
||||
default=None, null=True, blank=True,
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
related_name='archiveresult_set',
|
||||
to=settings.AUTH_USER_MODEL,
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='modified_at',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='retry_at',
|
||||
field=models.DateTimeField(default=django.utils.timezone.now, null=True, blank=True, db_index=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='notes',
|
||||
field=models.TextField(blank=True, default=''),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_dir',
|
||||
field=models.CharField(max_length=256, default=None, null=True, blank=True),
|
||||
),
|
||||
|
||||
# Populate UUIDs and data for archive results
|
||||
migrations.RunPython(generate_uuid_for_archiveresults, migrations.RunPython.noop),
|
||||
migrations.RunPython(copy_created_at_from_start_ts, migrations.RunPython.noop),
|
||||
migrations.RunPython(populate_created_by_archiveresult, migrations.RunPython.noop),
|
||||
|
||||
# Make created_by non-nullable
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
related_name='archiveresult_set',
|
||||
to=settings.AUTH_USER_MODEL,
|
||||
db_index=True,
|
||||
),
|
||||
),
|
||||
|
||||
# Update extractor choices
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='extractor',
|
||||
field=models.CharField(
|
||||
choices=[
|
||||
('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'),
|
||||
('media', 'media'), ('archive_org', 'archive_org'), ('readability', 'readability'),
|
||||
('mercury', 'mercury'), ('favicon', 'favicon'), ('pdf', 'pdf'),
|
||||
('headers', 'headers'), ('screenshot', 'screenshot'), ('dom', 'dom'),
|
||||
('title', 'title'), ('wget', 'wget'),
|
||||
],
|
||||
max_length=32, db_index=True,
|
||||
),
|
||||
),
|
||||
|
||||
# Update status field
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='status',
|
||||
field=models.CharField(
|
||||
choices=[
|
||||
('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'),
|
||||
('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped'),
|
||||
],
|
||||
max_length=16, default='queued', db_index=True,
|
||||
),
|
||||
),
|
||||
|
||||
# Update output field size
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output',
|
||||
field=models.CharField(max_length=1024, default=None, null=True, blank=True),
|
||||
),
|
||||
|
||||
# Update cmd_version field size
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='cmd_version',
|
||||
field=models.CharField(max_length=128, default=None, null=True, blank=True),
|
||||
),
|
||||
|
||||
# Make start_ts and end_ts nullable
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='start_ts',
|
||||
field=models.DateTimeField(default=None, null=True, blank=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='end_ts',
|
||||
field=models.DateTimeField(default=None, null=True, blank=True),
|
||||
),
|
||||
|
||||
# Make pwd nullable
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='pwd',
|
||||
field=models.CharField(max_length=256, default=None, null=True, blank=True),
|
||||
),
|
||||
|
||||
# Make cmd nullable
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='cmd',
|
||||
field=models.JSONField(default=None, null=True, blank=True),
|
||||
),
|
||||
|
||||
# Update model options
|
||||
migrations.AlterModelOptions(
|
||||
name='archiveresult',
|
||||
options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
|
||||
),
|
||||
migrations.AlterModelOptions(
|
||||
name='snapshot',
|
||||
options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
|
||||
),
|
||||
migrations.AlterModelOptions(
|
||||
name='tag',
|
||||
options={'verbose_name': 'Tag', 'verbose_name_plural': 'Tags'},
|
||||
),
|
||||
]
|
||||
@@ -1,101 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-05-13 11:43
|
||||
|
||||
from django.db import migrations
|
||||
from datetime import datetime
|
||||
|
||||
from archivebox.base_models.abid import abid_from_values, DEFAULT_ABID_URI_SALT
|
||||
|
||||
|
||||
def calculate_abid(self):
|
||||
"""
|
||||
Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
|
||||
"""
|
||||
prefix = self.abid_prefix
|
||||
ts = eval(self.abid_ts_src)
|
||||
uri = eval(self.abid_uri_src)
|
||||
subtype = eval(self.abid_subtype_src)
|
||||
rand = eval(self.abid_rand_src)
|
||||
|
||||
if (not prefix) or prefix == 'obj_':
|
||||
suggested_abid = self.__class__.__name__[:3].lower()
|
||||
raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')
|
||||
|
||||
if not ts:
|
||||
ts = datetime.utcfromtimestamp(0)
|
||||
print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())
|
||||
|
||||
if not uri:
|
||||
uri = str(self)
|
||||
print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)
|
||||
|
||||
if not subtype:
|
||||
subtype = self.__class__.__name__
|
||||
print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)
|
||||
|
||||
if not rand:
|
||||
rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
|
||||
print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)
|
||||
|
||||
abid = abid_from_values(
|
||||
prefix=prefix,
|
||||
ts=ts,
|
||||
uri=uri,
|
||||
subtype=subtype,
|
||||
rand=rand,
|
||||
salt=DEFAULT_ABID_URI_SALT,
|
||||
)
|
||||
assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
|
||||
return abid
|
||||
|
||||
|
||||
def copy_snapshot_uuids(apps, schema_editor):
|
||||
print(' Copying snapshot.id -> snapshot.uuid...')
|
||||
Snapshot = apps.get_model("core", "Snapshot")
|
||||
for snapshot in Snapshot.objects.all():
|
||||
snapshot.uuid = snapshot.id
|
||||
snapshot.save(update_fields=["uuid"])
|
||||
|
||||
def generate_snapshot_abids(apps, schema_editor):
|
||||
print(' Generating snapshot.abid values...')
|
||||
Snapshot = apps.get_model("core", "Snapshot")
|
||||
for snapshot in Snapshot.objects.all():
|
||||
snapshot.abid_prefix = 'snp_'
|
||||
snapshot.abid_ts_src = 'self.added'
|
||||
snapshot.abid_uri_src = 'self.url'
|
||||
snapshot.abid_subtype_src = '"01"'
|
||||
snapshot.abid_rand_src = 'self.uuid'
|
||||
|
||||
snapshot.abid = calculate_abid(snapshot)
|
||||
snapshot.uuid = snapshot.abid.uuid
|
||||
snapshot.save(update_fields=["abid", "uuid"])
|
||||
|
||||
def generate_archiveresult_abids(apps, schema_editor):
|
||||
print(' Generating ArchiveResult.abid values... (may take an hour or longer for large collections...)')
|
||||
ArchiveResult = apps.get_model("core", "ArchiveResult")
|
||||
Snapshot = apps.get_model("core", "Snapshot")
|
||||
for result in ArchiveResult.objects.all():
|
||||
result.abid_prefix = 'res_'
|
||||
result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
|
||||
result.snapshot_added = result.snapshot.added
|
||||
result.snapshot_url = result.snapshot.url
|
||||
result.abid_ts_src = 'self.snapshot_added'
|
||||
result.abid_uri_src = 'self.snapshot_url'
|
||||
result.abid_subtype_src = 'self.extractor'
|
||||
result.abid_rand_src = 'self.id'
|
||||
|
||||
result.abid = calculate_abid(result)
|
||||
result.uuid = result.abid.uuid
|
||||
result.save(update_fields=["abid", "uuid"])
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RunPython(copy_snapshot_uuids, reverse_code=migrations.RunPython.noop),
|
||||
migrations.RunPython(generate_snapshot_abids, reverse_code=migrations.RunPython.noop),
|
||||
migrations.RunPython(generate_archiveresult_abids, reverse_code=migrations.RunPython.noop),
|
||||
]
|
||||
@@ -1,19 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-05-13 12:08
|
||||
|
||||
import uuid
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0024_auto_20240513_1143'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
|
||||
),
|
||||
]
|
||||
@@ -1,117 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-05-13 13:01
|
||||
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
|
||||
import archivebox.base_models.models
|
||||
|
||||
|
||||
def updated_created_by_ids(apps, schema_editor):
|
||||
"""Get or create a system user with is_superuser=True to be the default owner for new DB rows"""
|
||||
|
||||
User = apps.get_model("auth", "User")
|
||||
ArchiveResult = apps.get_model("core", "ArchiveResult")
|
||||
Snapshot = apps.get_model("core", "Snapshot")
|
||||
Tag = apps.get_model("core", "Tag")
|
||||
|
||||
# if only one user exists total, return that user
|
||||
if User.objects.filter(is_superuser=True).count() == 1:
|
||||
user_id = User.objects.filter(is_superuser=True).values_list('pk', flat=True)[0]
|
||||
|
||||
# otherwise, create a dedicated "system" user
|
||||
user_id = User.objects.get_or_create(username='system', is_staff=True, is_superuser=True, defaults={'email': '', 'password': ''})[0].pk
|
||||
|
||||
ArchiveResult.objects.all().update(created_by_id=user_id)
|
||||
Snapshot.objects.all().update(created_by_id=user_id)
|
||||
Tag.objects.all().update(created_by_id=user_id)
|
||||
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0025_alter_archiveresult_uuid'),
|
||||
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='created',
|
||||
field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
|
||||
preserve_default=False,
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(null=True, default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='modified',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='created',
|
||||
field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
|
||||
preserve_default=False,
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(null=True, default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='modified',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='created',
|
||||
field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
|
||||
preserve_default=False,
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(null=True, default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='modified',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='uuid',
|
||||
field=models.UUIDField(blank=True, null=True, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
field=models.UUIDField(blank=True, null=True, unique=True),
|
||||
),
|
||||
|
||||
|
||||
migrations.RunPython(updated_created_by_ids, reverse_code=migrations.RunPython.noop),
|
||||
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
]
|
||||
@@ -1,105 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 02:48
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
from datetime import datetime
|
||||
from archivebox.base_models.abid import ABID, abid_from_values, DEFAULT_ABID_URI_SALT
|
||||
|
||||
|
||||
def calculate_abid(self):
|
||||
"""
|
||||
Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
|
||||
"""
|
||||
prefix = self.abid_prefix
|
||||
ts = eval(self.abid_ts_src)
|
||||
uri = eval(self.abid_uri_src)
|
||||
subtype = eval(self.abid_subtype_src)
|
||||
rand = eval(self.abid_rand_src)
|
||||
|
||||
if (not prefix) or prefix == 'obj_':
|
||||
suggested_abid = self.__class__.__name__[:3].lower()
|
||||
raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')
|
||||
|
||||
if not ts:
|
||||
ts = datetime.utcfromtimestamp(0)
|
||||
print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())
|
||||
|
||||
if not uri:
|
||||
uri = str(self)
|
||||
print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)
|
||||
|
||||
if not subtype:
|
||||
subtype = self.__class__.__name__
|
||||
print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)
|
||||
|
||||
if not rand:
|
||||
rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
|
||||
print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)
|
||||
|
||||
abid = abid_from_values(
|
||||
prefix=prefix,
|
||||
ts=ts,
|
||||
uri=uri,
|
||||
subtype=subtype,
|
||||
rand=rand,
|
||||
salt=DEFAULT_ABID_URI_SALT,
|
||||
)
|
||||
assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
|
||||
return abid
|
||||
|
||||
def update_snapshot_ids(apps, schema_editor):
|
||||
Snapshot = apps.get_model("core", "Snapshot")
|
||||
num_total = Snapshot.objects.all().count()
|
||||
print(f' Updating {num_total} Snapshot.id, Snapshot.uuid values in place...')
|
||||
for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator(chunk_size=500)):
|
||||
assert snapshot.abid
|
||||
snapshot.abid_prefix = 'snp_'
|
||||
snapshot.abid_ts_src = 'self.added'
|
||||
snapshot.abid_uri_src = 'self.url'
|
||||
snapshot.abid_subtype_src = '"01"'
|
||||
snapshot.abid_rand_src = 'self.uuid'
|
||||
|
||||
snapshot.abid = calculate_abid(snapshot)
|
||||
snapshot.uuid = snapshot.abid.uuid
|
||||
snapshot.save(update_fields=["abid", "uuid"])
|
||||
assert str(ABID.parse(snapshot.abid).uuid) == str(snapshot.uuid)
|
||||
if idx % 1000 == 0:
|
||||
print(f'Migrated {idx}/{num_total} Snapshot objects...')
|
||||
|
||||
def update_archiveresult_ids(apps, schema_editor):
|
||||
Snapshot = apps.get_model("core", "Snapshot")
|
||||
ArchiveResult = apps.get_model("core", "ArchiveResult")
|
||||
num_total = ArchiveResult.objects.all().count()
|
||||
print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
|
||||
for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator(chunk_size=500)):
|
||||
assert result.abid
|
||||
result.abid_prefix = 'res_'
|
||||
result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
|
||||
result.snapshot_added = result.snapshot.added
|
||||
result.snapshot_url = result.snapshot.url
|
||||
result.abid_ts_src = 'self.snapshot_added'
|
||||
result.abid_uri_src = 'self.snapshot_url'
|
||||
result.abid_subtype_src = 'self.extractor'
|
||||
result.abid_rand_src = 'self.id'
|
||||
|
||||
result.abid = calculate_abid(result)
|
||||
result.uuid = result.abid.uuid
|
||||
result.uuid = ABID.parse(result.abid).uuid
|
||||
result.save(update_fields=["abid", "uuid"])
|
||||
assert str(ABID.parse(result.abid).uuid) == str(result.uuid)
|
||||
if idx % 5000 == 0:
|
||||
print(f'Migrated {idx}/{num_total} ArchiveResult objects...')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0026_archiveresult_created_archiveresult_created_by_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RunPython(update_snapshot_ids, reverse_code=migrations.RunPython.noop),
|
||||
migrations.RunPython(update_archiveresult_ids, reverse_code=migrations.RunPython.noop),
|
||||
]
|
||||
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 04:28
|
||||
|
||||
import uuid
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0027_update_snapshot_ids'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
field=models.UUIDField(default=uuid.uuid4),
|
||||
),
|
||||
]
|
||||
@@ -1,18 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 04:28
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0028_alter_archiveresult_uuid'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.BigIntegerField(primary_key=True, serialize=False, verbose_name='ID'),
|
||||
),
|
||||
]
|
||||
@@ -1,18 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 05:00
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0029_alter_archiveresult_id'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
field=models.UUIDField(unique=True),
|
||||
),
|
||||
]
|
||||
@@ -1,34 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 05:09
|
||||
|
||||
import uuid
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0030_alter_archiveresult_uuid'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.IntegerField(default=uuid.uuid4, primary_key=True, serialize=False, verbose_name='ID'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
field=models.UUIDField(default=uuid.uuid4, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='uuid',
|
||||
field=models.UUIDField(default=uuid.uuid4, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='uuid',
|
||||
field=models.UUIDField(default=uuid.uuid4, null=True, unique=True),
|
||||
),
|
||||
]
|
||||
@@ -1,23 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 05:20
|
||||
|
||||
import core.models
|
||||
import random
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
def rand_int_id():
|
||||
return random.getrandbits(32)
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.BigIntegerField(default=rand_int_id, primary_key=True, serialize=False, verbose_name='ID'),
|
||||
),
|
||||
]
|
||||
@@ -1,18 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 05:34
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0032_alter_archiveresult_id'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='archiveresult',
|
||||
old_name='id',
|
||||
new_name='old_id',
|
||||
),
|
||||
]
|
||||
@@ -1,45 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 05:37
|
||||
|
||||
import uuid
|
||||
import random
|
||||
from django.db import migrations, models
|
||||
|
||||
from archivebox.base_models.abid import ABID
|
||||
|
||||
|
||||
def rand_int_id():
|
||||
return random.getrandbits(32)
|
||||
|
||||
|
||||
def update_archiveresult_ids(apps, schema_editor):
|
||||
ArchiveResult = apps.get_model("core", "ArchiveResult")
|
||||
num_total = ArchiveResult.objects.all().count()
|
||||
print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
|
||||
for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator(chunk_size=500)):
|
||||
assert result.abid
|
||||
result.uuid = ABID.parse(result.abid).uuid
|
||||
result.save(update_fields=["uuid"])
|
||||
assert str(ABID.parse(result.abid).uuid) == str(result.uuid)
|
||||
if idx % 2500 == 0:
|
||||
print(f'Migrated {idx}/{num_total} ArchiveResult objects...')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0033_rename_id_archiveresult_old_id'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='old_id',
|
||||
field=models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='ID'),
|
||||
),
|
||||
migrations.RunPython(update_archiveresult_ids, reverse_code=migrations.RunPython.noop),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
]
|
||||
@@ -1,19 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 05:49
|
||||
|
||||
import uuid
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='archiveresult',
|
||||
old_name='uuid',
|
||||
new_name='id',
|
||||
),
|
||||
]
|
||||
@@ -1,29 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 05:59
|
||||
|
||||
import core.models
|
||||
import uuid
|
||||
import random
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
def rand_int_id():
|
||||
return random.getrandbits(32)
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0035_remove_archiveresult_uuid_archiveresult_id'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True, verbose_name='ID'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='old_id',
|
||||
field=models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID'),
|
||||
),
|
||||
]
|
||||
@@ -1,18 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 06:08
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='snapshot',
|
||||
old_name='id',
|
||||
new_name='old_id',
|
||||
),
|
||||
]
|
||||
@@ -1,18 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 06:09
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0037_rename_id_snapshot_old_id'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='snapshot',
|
||||
old_name='uuid',
|
||||
new_name='id',
|
||||
),
|
||||
]
|
||||
@@ -1,18 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 06:25
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0038_rename_uuid_snapshot_id'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='archiveresult',
|
||||
old_name='snapshot',
|
||||
new_name='snapshot_old',
|
||||
),
|
||||
]
|
||||
@@ -1,34 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 06:46
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
def update_archiveresult_snapshot_ids(apps, schema_editor):
|
||||
ArchiveResult = apps.get_model("core", "ArchiveResult")
|
||||
Snapshot = apps.get_model("core", "Snapshot")
|
||||
num_total = ArchiveResult.objects.all().count()
|
||||
print(f' Updating {num_total} ArchiveResult.snapshot_id values in place... (may take an hour or longer for large collections...)')
|
||||
for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator(chunk_size=5000)):
|
||||
assert result.snapshot_old_id
|
||||
snapshot = Snapshot.objects.only('id').get(old_id=result.snapshot_old_id)
|
||||
result.snapshot_id = snapshot.id
|
||||
result.save(update_fields=["snapshot_id"])
|
||||
assert str(result.snapshot_id) == str(snapshot.id)
|
||||
if idx % 5000 == 0:
|
||||
print(f'Migrated {idx}/{num_total} ArchiveResult objects...')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0039_rename_snapshot_archiveresult_snapshot_old'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='snapshot',
|
||||
field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresults', to='core.snapshot', to_field='id'),
|
||||
),
|
||||
migrations.RunPython(update_archiveresult_snapshot_ids, reverse_code=migrations.RunPython.noop),
|
||||
]
|
||||
@@ -1,24 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 06:50
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0040_archiveresult_snapshot'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='snapshot',
|
||||
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='snapshot_old',
|
||||
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='archiveresults_old', to='core.snapshot'),
|
||||
),
|
||||
]
|
||||
@@ -1,17 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 06:51
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0041_alter_archiveresult_snapshot_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='snapshot_old',
|
||||
),
|
||||
]
|
||||
@@ -1,20 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-18 06:52
|
||||
|
||||
import django.db.models.deletion
|
||||
import uuid
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0042_remove_archiveresult_snapshot_old'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='snapshot',
|
||||
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
|
||||
),
|
||||
]
|
||||
@@ -1,40 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-19 23:01
|
||||
|
||||
import django.db.models.deletion
|
||||
import uuid
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.SeparateDatabaseAndState(
|
||||
database_operations=[
|
||||
# No-op, SnapshotTag model already exists in DB
|
||||
],
|
||||
state_operations=[
|
||||
migrations.CreateModel(
|
||||
name='SnapshotTag',
|
||||
fields=[
|
||||
('id', models.AutoField(primary_key=True, serialize=False)),
|
||||
('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')),
|
||||
('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.tag')),
|
||||
],
|
||||
options={
|
||||
'db_table': 'core_snapshot_tags',
|
||||
'unique_together': {('snapshot', 'tag')},
|
||||
},
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', to='core.tag'),
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -1,19 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 01:54
|
||||
|
||||
import uuid
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='old_id',
|
||||
field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
]
|
||||
@@ -1,30 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 01:55
|
||||
|
||||
import django.db.models.deletion
|
||||
import uuid
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0045_alter_snapshot_old_id'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='snapshot',
|
||||
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='old_id',
|
||||
field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
|
||||
),
|
||||
]
|
||||
@@ -1,24 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 02:16
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='snapshot',
|
||||
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='tag',
|
||||
field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
|
||||
),
|
||||
]
|
||||
@@ -1,24 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 02:17
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0047_alter_snapshottag_unique_together_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='snapshot',
|
||||
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='snapshot',
|
||||
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='old_id'),
|
||||
),
|
||||
]
|
||||
@@ -1,22 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 02:26
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0048_alter_archiveresult_snapshot_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='snapshottag',
|
||||
old_name='snapshot',
|
||||
new_name='snapshot_old',
|
||||
),
|
||||
migrations.AlterUniqueTogether(
|
||||
name='snapshottag',
|
||||
unique_together={('snapshot_old', 'tag')},
|
||||
),
|
||||
]
|
||||
@@ -1,19 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 02:30
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='snapshot_old',
|
||||
field=models.ForeignKey(db_column='snapshot_old_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='old_id'),
|
||||
),
|
||||
]
|
||||
@@ -1,40 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 02:31
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
def update_snapshottag_ids(apps, schema_editor):
|
||||
Snapshot = apps.get_model("core", "Snapshot")
|
||||
SnapshotTag = apps.get_model("core", "SnapshotTag")
|
||||
num_total = SnapshotTag.objects.all().count()
|
||||
print(f' Updating {num_total} SnapshotTag.snapshot_id values in place... (may take an hour or longer for large collections...)')
|
||||
for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator(chunk_size=500)):
|
||||
assert snapshottag.snapshot_old_id
|
||||
snapshot = Snapshot.objects.get(old_id=snapshottag.snapshot_old_id)
|
||||
snapshottag.snapshot_id = snapshot.id
|
||||
snapshottag.save(update_fields=["snapshot_id"])
|
||||
assert str(snapshottag.snapshot_id) == str(snapshot.id)
|
||||
if idx % 100 == 0:
|
||||
print(f'Migrated {idx}/{num_total} SnapshotTag objects...')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0050_alter_snapshottag_snapshot_old'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='snapshottag',
|
||||
name='snapshot',
|
||||
field=models.ForeignKey(blank=True, db_column='snapshot_id', null=True, on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='snapshot_old',
|
||||
field=models.ForeignKey(db_column='snapshot_old_id', on_delete=django.db.models.deletion.CASCADE, related_name='snapshottag_old_set', to='core.snapshot', to_field='old_id'),
|
||||
),
|
||||
migrations.RunPython(update_snapshottag_ids, reverse_code=migrations.RunPython.noop),
|
||||
]
|
||||
@@ -1,27 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 02:37
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterUniqueTogether(
|
||||
name='snapshottag',
|
||||
unique_together=set(),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='snapshot',
|
||||
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
|
||||
),
|
||||
migrations.AlterUniqueTogether(
|
||||
name='snapshottag',
|
||||
unique_together={('snapshot', 'tag')},
|
||||
),
|
||||
]
|
||||
@@ -1,17 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 02:38
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0052_alter_snapshottag_unique_together_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RemoveField(
|
||||
model_name='snapshottag',
|
||||
name='snapshot_old',
|
||||
),
|
||||
]
|
||||
@@ -1,18 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 02:40
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0053_remove_snapshottag_snapshot_old'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='timestamp',
|
||||
field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
|
||||
),
|
||||
]
|
||||
@@ -1,18 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 03:24
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0054_alter_snapshot_timestamp'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='slug',
|
||||
field=models.SlugField(editable=False, max_length=100, unique=True),
|
||||
),
|
||||
]
|
||||
@@ -1,17 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 03:25
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0055_alter_tag_slug'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RemoveField(
|
||||
model_name='tag',
|
||||
name='uuid',
|
||||
),
|
||||
]
|
||||
@@ -1,18 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 03:29
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0056_remove_tag_uuid'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='tag',
|
||||
old_name='id',
|
||||
new_name='old_id',
|
||||
),
|
||||
]
|
||||
@@ -1,22 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 03:30
|
||||
|
||||
import random
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
def rand_int_id():
|
||||
return random.getrandbits(32)
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0057_rename_id_tag_old_id'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='old_id',
|
||||
field=models.BigIntegerField(default=rand_int_id, primary_key=True, serialize=False, verbose_name='Old ID'),
|
||||
),
|
||||
]
|
||||
@@ -1,90 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 03:33
|
||||
|
||||
from datetime import datetime
|
||||
from django.db import migrations, models
|
||||
from archivebox.base_models.abid import abid_from_values
|
||||
from archivebox.base_models.models import ABID
|
||||
|
||||
def calculate_abid(self):
|
||||
"""
|
||||
Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
|
||||
"""
|
||||
prefix = self.abid_prefix
|
||||
ts = eval(self.abid_ts_src)
|
||||
uri = eval(self.abid_uri_src)
|
||||
subtype = eval(self.abid_subtype_src)
|
||||
rand = eval(self.abid_rand_src)
|
||||
|
||||
if (not prefix) or prefix == 'obj_':
|
||||
suggested_abid = self.__class__.__name__[:3].lower()
|
||||
raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')
|
||||
|
||||
if not ts:
|
||||
ts = datetime.utcfromtimestamp(0)
|
||||
print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())
|
||||
|
||||
if not uri:
|
||||
uri = str(self)
|
||||
print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)
|
||||
|
||||
if not subtype:
|
||||
subtype = self.__class__.__name__
|
||||
print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)
|
||||
|
||||
if not rand:
|
||||
rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
|
||||
print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)
|
||||
|
||||
abid = abid_from_values(
|
||||
prefix=prefix,
|
||||
ts=ts,
|
||||
uri=uri,
|
||||
subtype=subtype,
|
||||
rand=rand,
|
||||
)
|
||||
assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
|
||||
return abid
|
||||
|
||||
|
||||
def update_archiveresult_ids(apps, schema_editor):
|
||||
Tag = apps.get_model("core", "Tag")
|
||||
num_total = Tag.objects.all().count()
|
||||
print(f' Updating {num_total} Tag.id, ArchiveResult.uuid values in place...')
|
||||
for idx, tag in enumerate(Tag.objects.all().iterator(chunk_size=500)):
|
||||
if not tag.slug:
|
||||
tag.slug = tag.name.lower().replace(' ', '_')
|
||||
if not tag.name:
|
||||
tag.name = tag.slug
|
||||
if not (tag.name or tag.slug):
|
||||
tag.delete()
|
||||
continue
|
||||
|
||||
assert tag.slug or tag.name, f'Tag.slug must be defined! You have a Tag(id={tag.pk}) missing a slug!'
|
||||
tag.abid_prefix = 'tag_'
|
||||
tag.abid_ts_src = 'self.created'
|
||||
tag.abid_uri_src = 'self.slug'
|
||||
tag.abid_subtype_src = '"03"'
|
||||
tag.abid_rand_src = 'self.old_id'
|
||||
tag.abid = calculate_abid(tag)
|
||||
tag.id = tag.abid.uuid
|
||||
tag.save(update_fields=["abid", "id", "name", "slug"])
|
||||
assert str(ABID.parse(tag.abid).uuid) == str(tag.id)
|
||||
if idx % 10 == 0:
|
||||
print(f'Migrated {idx}/{num_total} Tag objects...')
|
||||
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0058_alter_tag_old_id'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='id',
|
||||
field=models.UUIDField(blank=True, null=True),
|
||||
),
|
||||
migrations.RunPython(update_archiveresult_ids, reverse_code=migrations.RunPython.noop),
|
||||
]
|
||||
@@ -1,19 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 03:42
|
||||
|
||||
import uuid
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0059_tag_id'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
|
||||
),
|
||||
]
|
||||
@@ -1,22 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 03:43
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0060_alter_tag_id'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='snapshottag',
|
||||
old_name='tag',
|
||||
new_name='old_tag',
|
||||
),
|
||||
migrations.AlterUniqueTogether(
|
||||
name='snapshottag',
|
||||
unique_together={('snapshot', 'old_tag')},
|
||||
),
|
||||
]
|
||||
@@ -1,19 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 03:44
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0061_rename_tag_snapshottag_old_tag_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='old_tag',
|
||||
field=models.ForeignKey(db_column='old_tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
|
||||
),
|
||||
]
|
||||
@@ -1,40 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 03:45
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
def update_snapshottag_ids(apps, schema_editor):
|
||||
Tag = apps.get_model("core", "Tag")
|
||||
SnapshotTag = apps.get_model("core", "SnapshotTag")
|
||||
num_total = SnapshotTag.objects.all().count()
|
||||
print(f' Updating {num_total} SnapshotTag.tag_id values in place... (may take an hour or longer for large collections...)')
|
||||
for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator(chunk_size=500)):
|
||||
assert snapshottag.old_tag_id
|
||||
tag = Tag.objects.get(old_id=snapshottag.old_tag_id)
|
||||
snapshottag.tag_id = tag.id
|
||||
snapshottag.save(update_fields=["tag_id"])
|
||||
assert str(snapshottag.tag_id) == str(tag.id)
|
||||
if idx % 100 == 0:
|
||||
print(f'Migrated {idx}/{num_total} SnapshotTag objects...')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0062_alter_snapshottag_old_tag'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='snapshottag',
|
||||
name='tag',
|
||||
field=models.ForeignKey(blank=True, db_column='tag_id', null=True, on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='old_tag',
|
||||
field=models.ForeignKey(db_column='old_tag_id', on_delete=django.db.models.deletion.CASCADE, related_name='snapshottags_old', to='core.tag'),
|
||||
),
|
||||
migrations.RunPython(update_snapshottag_ids, reverse_code=migrations.RunPython.noop),
|
||||
]
|
||||
@@ -1,27 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 03:50
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterUniqueTogether(
|
||||
name='snapshottag',
|
||||
unique_together=set(),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='tag',
|
||||
field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'),
|
||||
),
|
||||
migrations.AlterUniqueTogether(
|
||||
name='snapshottag',
|
||||
unique_together={('snapshot', 'tag')},
|
||||
),
|
||||
]
|
||||
@@ -1,17 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 03:51
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0064_alter_snapshottag_unique_together_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RemoveField(
|
||||
model_name='snapshottag',
|
||||
name='old_tag',
|
||||
),
|
||||
]
|
||||
@@ -1,34 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 03:52
|
||||
|
||||
import core.models
|
||||
import django.db.models.deletion
|
||||
import uuid
|
||||
import random
|
||||
from django.db import migrations, models
|
||||
|
||||
def rand_int_id():
|
||||
return random.getrandbits(32)
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0065_remove_snapshottag_old_tag'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='tag',
|
||||
field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='old_id',
|
||||
field=models.BigIntegerField(default=rand_int_id, serialize=False, unique=True, verbose_name='Old ID'),
|
||||
),
|
||||
]
|
||||
@@ -1,19 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 03:53
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='tag',
|
||||
field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
|
||||
),
|
||||
]
|
||||
@@ -1,17 +0,0 @@
|
||||
# Generated by Django 5.0.6 on 2024-08-20 07:26
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0067_alter_snapshottag_tag'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterModelOptions(
|
||||
name='archiveresult',
|
||||
options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
|
||||
),
|
||||
]
|
||||
@@ -1,36 +0,0 @@
|
||||
# Generated by Django 5.1 on 2024-08-28 09:40
|
||||
|
||||
import django.utils.timezone
|
||||
from django.db import migrations
|
||||
|
||||
import archivebox.base_models.models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0068_alter_archiveresult_options'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='created',
|
||||
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='added',
|
||||
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='created',
|
||||
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='created',
|
||||
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
]
|
||||
@@ -1,53 +0,0 @@
|
||||
# Generated by Django 5.1 on 2024-09-04 09:00
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
|
||||
import archivebox.base_models.models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0069_alter_archiveresult_created_alter_snapshot_added_and_more'),
|
||||
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='added',
|
||||
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=None),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='created',
|
||||
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=None),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='id',
|
||||
field=models.UUIDField(default=None, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='old_id',
|
||||
field=models.UUIDField(default=None, editable=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='created',
|
||||
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=None),
|
||||
),
|
||||
]
|
||||
@@ -1,66 +0,0 @@
|
||||
# Generated by Django 5.1 on 2024-09-04 23:23
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
|
||||
import archivebox.base_models.models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0070_alter_archiveresult_created_by_alter_snapshot_added_and_more'),
|
||||
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='old_id',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='snapshot',
|
||||
name='old_id',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='tag',
|
||||
name='old_id',
|
||||
),
|
||||
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='created',
|
||||
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=None),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.UUIDField(default=None, editable=False, primary_key=True, serialize=False, unique=True, verbose_name='ID'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='id',
|
||||
field=models.UUIDField(default=None, editable=False, primary_key=True, serialize=False, unique=True, verbose_name='ID'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='id',
|
||||
field=models.UUIDField(default=None, editable=False, primary_key=True, serialize=False, unique=True, verbose_name='ID'),
|
||||
),
|
||||
]
|
||||
@@ -1,23 +0,0 @@
|
||||
# Generated by Django 5.1 on 2024-09-05 00:05
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='snapshot',
|
||||
old_name='added',
|
||||
new_name='bookmarked_at',
|
||||
),
|
||||
migrations.RenameField(
|
||||
model_name='snapshot',
|
||||
old_name='updated',
|
||||
new_name='downloaded_at',
|
||||
),
|
||||
]
|
||||
@@ -1,43 +0,0 @@
|
||||
# Generated by Django 5.1 on 2024-09-05 00:25
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='archiveresult',
|
||||
old_name='created',
|
||||
new_name='created_at',
|
||||
),
|
||||
migrations.RenameField(
|
||||
model_name='archiveresult',
|
||||
old_name='modified',
|
||||
new_name='modified_at',
|
||||
),
|
||||
migrations.RenameField(
|
||||
model_name='snapshot',
|
||||
old_name='created',
|
||||
new_name='created_at',
|
||||
),
|
||||
migrations.RenameField(
|
||||
model_name='snapshot',
|
||||
old_name='modified',
|
||||
new_name='modified_at',
|
||||
),
|
||||
migrations.RenameField(
|
||||
model_name='tag',
|
||||
old_name='created',
|
||||
new_name='created_at',
|
||||
),
|
||||
migrations.RenameField(
|
||||
model_name='tag',
|
||||
old_name='modified',
|
||||
new_name='modified_at',
|
||||
),
|
||||
]
|
||||
@@ -1,18 +0,0 @@
|
||||
# Generated by Django 5.1 on 2024-09-05 01:24
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0073_rename_created_archiveresult_created_at_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='downloaded_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
|
||||
),
|
||||
]
|
||||
@@ -1,7 +1,8 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
from typing import Optional, Dict, Iterable, Any
|
||||
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
|
||||
from uuid import uuid7
|
||||
from datetime import datetime, timedelta
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
|
||||
import os
|
||||
@@ -18,15 +19,11 @@ from django.urls import reverse, reverse_lazy
|
||||
from django.contrib import admin
|
||||
from django.conf import settings
|
||||
|
||||
import abx
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.misc.system import get_dir_size
|
||||
from archivebox.misc.util import parse_date, base_url, domain as url_domain
|
||||
from archivebox.misc.system import get_dir_size, atomic_write
|
||||
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
|
||||
from archivebox.misc.hashing import get_dir_info
|
||||
from archivebox.index.schema import Link
|
||||
from archivebox.index.html import snapshot_icons
|
||||
from archivebox.extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
|
||||
from archivebox.hooks import ARCHIVE_METHODS_INDEXING_PRECEDENCE
|
||||
from archivebox.base_models.models import (
|
||||
ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
|
||||
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
|
||||
@@ -38,6 +35,7 @@ from crawls.models import Crawl
|
||||
from machine.models import NetworkInterface
|
||||
|
||||
|
||||
|
||||
class Tag(ModelWithSerializers):
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='tag_set')
|
||||
@@ -94,8 +92,181 @@ class SnapshotManager(models.Manager):
|
||||
def get_queryset(self):
|
||||
return super().get_queryset().prefetch_related('tags', 'archiveresult_set')
|
||||
|
||||
# =========================================================================
|
||||
# Filtering Methods
|
||||
# =========================================================================
|
||||
|
||||
class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
|
||||
FILTER_TYPES = {
|
||||
'exact': lambda pattern: models.Q(url=pattern),
|
||||
'substring': lambda pattern: models.Q(url__icontains=pattern),
|
||||
'regex': lambda pattern: models.Q(url__iregex=pattern),
|
||||
'domain': lambda pattern: models.Q(url__istartswith=f"http://{pattern}") | models.Q(url__istartswith=f"https://{pattern}") | models.Q(url__istartswith=f"ftp://{pattern}"),
|
||||
'tag': lambda pattern: models.Q(tags__name=pattern),
|
||||
'timestamp': lambda pattern: models.Q(timestamp=pattern),
|
||||
}
|
||||
|
||||
def filter_by_patterns(self, patterns: List[str], filter_type: str = 'exact') -> QuerySet:
|
||||
"""Filter snapshots by URL patterns using specified filter type"""
|
||||
from archivebox.misc.logging import stderr
|
||||
|
||||
q_filter = models.Q()
|
||||
for pattern in patterns:
|
||||
try:
|
||||
q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern)
|
||||
except KeyError:
|
||||
stderr()
|
||||
stderr(f'[X] Got invalid pattern for --filter-type={filter_type}:', color='red')
|
||||
stderr(f' {pattern}')
|
||||
raise SystemExit(2)
|
||||
return self.filter(q_filter)
|
||||
|
||||
def search(self, patterns: List[str]) -> QuerySet:
|
||||
"""Search snapshots using the configured search backend"""
|
||||
from archivebox.config.common import SEARCH_BACKEND_CONFIG
|
||||
from archivebox.search import query_search_index
|
||||
from archivebox.misc.logging import stderr
|
||||
|
||||
if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
|
||||
stderr()
|
||||
stderr('[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', color='red')
|
||||
raise SystemExit(2)
|
||||
|
||||
qsearch = self.none()
|
||||
for pattern in patterns:
|
||||
try:
|
||||
qsearch |= query_search_index(pattern)
|
||||
except:
|
||||
raise SystemExit(2)
|
||||
return self.all() & qsearch
|
||||
|
||||
# =========================================================================
|
||||
# Export Methods
|
||||
# =========================================================================
|
||||
|
||||
def to_json(self, with_headers: bool = False) -> str:
|
||||
"""Generate JSON index from snapshots"""
|
||||
import sys
|
||||
from datetime import datetime, timezone as tz
|
||||
from archivebox.config import VERSION
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
|
||||
MAIN_INDEX_HEADER = {
|
||||
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
|
||||
'schema': 'archivebox.index.json',
|
||||
'copyright_info': SERVER_CONFIG.FOOTER_INFO,
|
||||
'meta': {
|
||||
'project': 'ArchiveBox',
|
||||
'version': VERSION,
|
||||
'git_sha': VERSION,
|
||||
'website': 'https://ArchiveBox.io',
|
||||
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
|
||||
'source': 'https://github.com/ArchiveBox/ArchiveBox',
|
||||
'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
|
||||
'dependencies': {},
|
||||
},
|
||||
} if with_headers else {}
|
||||
|
||||
snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)]
|
||||
|
||||
if with_headers:
|
||||
output = {
|
||||
**MAIN_INDEX_HEADER,
|
||||
'num_links': len(snapshot_dicts),
|
||||
'updated': datetime.now(tz.utc),
|
||||
'last_run_cmd': sys.argv,
|
||||
'links': snapshot_dicts,
|
||||
}
|
||||
else:
|
||||
output = snapshot_dicts
|
||||
return to_json(output, indent=4, sort_keys=True)
|
||||
|
||||
def to_csv(self, cols: Optional[List[str]] = None, header: bool = True, separator: str = ',', ljust: int = 0) -> str:
|
||||
"""Generate CSV output from snapshots"""
|
||||
cols = cols or ['timestamp', 'is_archived', 'url']
|
||||
header_str = separator.join(col.ljust(ljust) for col in cols) if header else ''
|
||||
row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500))
|
||||
return '\n'.join((header_str, *row_strs))
|
||||
|
||||
def to_html(self, with_headers: bool = True) -> str:
|
||||
"""Generate main index HTML from snapshots"""
|
||||
from datetime import datetime, timezone as tz
|
||||
from django.template.loader import render_to_string
|
||||
from archivebox.config import VERSION
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.config.version import get_COMMIT_HASH
|
||||
|
||||
template = 'static_index.html' if with_headers else 'minimal_index.html'
|
||||
snapshot_list = list(self.iterator(chunk_size=500))
|
||||
|
||||
return render_to_string(template, {
|
||||
'version': VERSION,
|
||||
'git_sha': get_COMMIT_HASH() or VERSION,
|
||||
'num_links': str(len(snapshot_list)),
|
||||
'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
|
||||
'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
|
||||
'links': snapshot_list,
|
||||
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
|
||||
})
|
||||
|
||||
# =========================================================================
|
||||
# Import Methods
|
||||
# =========================================================================
|
||||
|
||||
def create_or_update_from_dict(self, link_dict: Dict[str, Any], created_by_id: Optional[int] = None) -> 'Snapshot':
    """Create or update a Snapshot from a SnapshotDict (parser output).

    Existing snapshots (matched by URL) get their title upgraded if the
    incoming title is longer; new snapshots get a unique timestamp.
    Tags are merged (union) with any existing tags.
    """
    import re
    from archivebox.config.common import GENERAL_CONFIG

    url = link_dict['url']
    timestamp = link_dict.get('timestamp')
    title = link_dict.get('title')
    tags_str = link_dict.get('tags')

    # Split the tags string into a de-duplicated, order-preserving list
    tag_list = []
    if tags_str:
        tag_list = list(dict.fromkeys(
            tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
            if tag.strip()
        ))

    try:
        snapshot = self.get(url=url)
        # Only overwrite the title if the incoming one is longer (more informative)
        if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
            snapshot.title = title
            snapshot.save(update_fields=['title', 'modified_at'])
    except self.model.DoesNotExist:
        if timestamp:
            # Bump the timestamp until unique — it doubles as the archive dir name
            while self.filter(timestamp=timestamp).exists():
                timestamp = str(float(timestamp) + 1.0)

        snapshot = self.create(
            url=url,
            timestamp=timestamp,
            title=title,
            created_by_id=created_by_id or get_or_create_system_user_pk(),
        )

    if tag_list:
        existing_tags = set(snapshot.tags.values_list('name', flat=True))
        new_tags = set(tag_list) | existing_tags
        snapshot.save_tags(new_tags)

    return snapshot
|
||||
|
||||
def create_from_dicts(self, link_dicts: List[Dict[str, Any]], created_by_id: Optional[int] = None) -> List['Snapshot']:
    """Create or update multiple Snapshots from a list of SnapshotDicts."""
    return [self.create_or_update_from_dict(d, created_by_id=created_by_id) for d in link_dicts]
|
||||
|
||||
def remove(self, atomic: bool = False) -> tuple:
    """Delete all snapshots in this queryset from the database.

    Args:
        atomic: wrap the deletion in a single database transaction.

    Returns:
        The (total_deleted, per_model_counts) tuple from QuerySet.delete().
    """
    from django.db import transaction
    if atomic:
        with transaction.atomic():
            return self.delete()
    return self.delete()
|
||||
|
||||
|
||||
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
@@ -108,6 +279,7 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
|
||||
|
||||
title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
|
||||
downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
|
||||
depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs
|
||||
|
||||
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
|
||||
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
|
||||
@@ -152,9 +324,6 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
|
||||
def archive(self, overwrite=False, methods=None):
    """Queue this snapshot for (re-)archiving in a background worker."""
    return bg_archive_snapshot(self, overwrite=overwrite, methods=methods)
|
||||
|
||||
def as_link(self) -> Link:
    """Convert this Snapshot into a legacy Link schema object."""
    return Link.from_json(self.as_json())
|
||||
|
||||
@admin.display(description='Tags')
|
||||
def tags_str(self, nocache=True) -> str | None:
|
||||
calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
|
||||
@@ -164,7 +333,55 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
|
||||
return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()
|
||||
|
||||
def icons(self) -> str:
    """Generate HTML icon links showing which extractors have succeeded for this snapshot.

    The rendered markup is cached for 24h, keyed on the snapshot pk and its
    last-touched timestamp so edits invalidate the cache automatically.
    """
    from django.utils.html import format_html, mark_safe
    from collections import defaultdict

    cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'

    def calc_icons():
        # Prefer prefetched results when available to avoid an extra query per row
        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
            archive_results = [r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output]
        else:
            archive_results = self.archiveresult_set.filter(status="succeeded", output__isnull=False)

        path = self.archive_path
        canon = self.canonical_outputs()
        output = ""
        output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
        icons = {
            "singlefile": "❶", "wget": "🆆", "dom": "🅷", "pdf": "📄",
            "screenshot": "💻", "media": "📼", "git": "🅶", "archive_org": "🏛",
            "readability": "🆁", "mercury": "🅼", "warc": "📦"
        }
        exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]

        # Latest result per extractor in one pass (last match wins, as before)
        extractor_outputs = defaultdict(lambda: None)
        for result in archive_results:
            extractor_outputs[result.extractor] = result

        def has_output(extractor):
            # True only if the latest result for this extractor succeeded with output
            result = extractor_outputs[extractor]
            return bool(result and result.status == 'succeeded' and result.output)

        for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
            if extractor not in exclude:
                output += format_html(output_template, path, canon.get(extractor, ''), str(has_output(extractor)), extractor, icons.get(extractor, "?"))
            if extractor == "wget":
                # wget success also implies a WARC archive; show its icon alongside
                output += format_html(output_template, path, canon.get("warc", "warc/"), str(has_output(extractor)), "warc", icons.get("warc", "?"))
            if extractor == "archive_org":
                # archive.org links point offsite rather than into the local archive dir
                output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon.get("archive_org", ""), str(has_output(extractor)), "archive_org", icons.get("archive_org", "?"))

        # BUGFIX: closing tag was previously written as a second opening <span>
        return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))

    cache_result = cache.get(cache_key)
    if cache_result:
        return cache_result

    fresh_result = calc_icons()
    cache.set(cache_key, fresh_result, timeout=60 * 60 * 24)
    return fresh_result
|
||||
|
||||
@property
|
||||
def api_url(self) -> str:
|
||||
@@ -178,7 +395,8 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
|
||||
return url_domain(self.url)
|
||||
|
||||
@cached_property
|
||||
def link_dir(self):
|
||||
def output_dir(self):
|
||||
"""The filesystem path to the snapshot's output directory."""
|
||||
return str(CONSTANTS.ARCHIVE_DIR / self.timestamp)
|
||||
|
||||
@cached_property
|
||||
@@ -188,7 +406,7 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
|
||||
@cached_property
|
||||
def archive_size(self):
|
||||
try:
|
||||
return get_dir_size(self.link_dir)[0]
|
||||
return get_dir_size(self.output_dir)[0]
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
@@ -200,20 +418,327 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
|
||||
def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
|
||||
return self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
|
||||
|
||||
def run(self) -> list['ArchiveResult']:
    """Execute this Snapshot by creating ArchiveResults for all enabled extractors.

    Called by the state machine when entering the 'started' state.
    """
    return self.create_pending_archiveresults()
|
||||
|
||||
def create_pending_archiveresults(self) -> list['ArchiveResult']:
    """Create queued ArchiveResult records for every enabled extractor.

    Uses the hooks system to discover available extractors from:
    - archivebox/plugins/*/on_Snapshot__*.{py,sh,js}
    - data/plugins/*/on_Snapshot__*.{py,sh,js}

    Returns:
        Only the results still in the initial (queued) state.
    """
    from archivebox.hooks import get_enabled_extractors

    extractors = get_enabled_extractors()
    archiveresults = []
    for extractor in extractors:
        # Skip extractors that already have a result recorded for this snapshot
        if ArchiveResult.objects.filter(snapshot=self, extractor=extractor).exists():
            continue
        archiveresult, _ = ArchiveResult.objects.get_or_create(
            snapshot=self,
            extractor=extractor,
            defaults={
                'status': ArchiveResult.INITIAL_STATE,
                'retry_at': timezone.now(),
                'created_by_id': self.created_by_id,
            },
        )
        if archiveresult.status == ArchiveResult.INITIAL_STATE:
            archiveresults.append(archiveresult)
    return archiveresults
|
||||
|
||||
def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
    """Reset failed/skipped ArchiveResults to queued for retry.

    This enables seamless retry of the entire extraction pipeline:
    - Resets FAILED and SKIPPED results to QUEUED
    - Sets retry_at so workers pick them up
    - Extractors run in order (numeric prefix)
    - Each extractor checks its dependencies at runtime

    Dependency handling (e.g., chrome_session → screenshot):
    - Extractors check if required outputs exist before running
    - If dependency output missing → extractor returns 'skipped'
    - On retry, if dependency now succeeds → dependent can run

    Returns:
        Count of ArchiveResults reset.
    """
    retry_at = retry_at or timezone.now()

    count = self.archiveresult_set.filter(
        status__in=[
            ArchiveResult.StatusChoices.FAILED,
            ArchiveResult.StatusChoices.SKIPPED,
        ]
    ).update(
        status=ArchiveResult.StatusChoices.QUEUED,
        retry_at=retry_at,
        output=None,
        start_ts=None,
        end_ts=None,
    )

    # Also reset the snapshot itself so it gets re-checked by the workers
    if count > 0:
        self.status = self.StatusChoices.STARTED
        self.retry_at = retry_at
        self.save(update_fields=['status', 'retry_at', 'modified_at'])

    return count
|
||||
|
||||
# =========================================================================
|
||||
# URL Helper Properties (migrated from Link schema)
|
||||
# =========================================================================
|
||||
|
||||
@cached_property
def url_hash(self) -> str:
    """First 8 hex chars of the SHA-256 of this snapshot's URL."""
    from hashlib import sha256
    return sha256(self.url.encode()).hexdigest()[:8]
|
||||
|
||||
@cached_property
def scheme(self) -> str:
    """URL scheme, e.g. 'https' for 'https://example.com/...'."""
    return self.url.split('://')[0]
|
||||
|
||||
@cached_property
def path(self) -> str:
    """URL path component including the leading slash ('/' if none)."""
    parts = self.url.split('://', 1)
    if len(parts) > 1 and '/' in parts[1]:
        # Everything after the host, re-prefixed with '/'
        return '/' + parts[1].split('/', 1)[1]
    return '/'
|
||||
|
||||
@cached_property
def basename(self) -> str:
    """Last segment of the URL path (may be empty for trailing-slash paths)."""
    return self.path.split('/')[-1]
|
||||
|
||||
@cached_property
def extension(self) -> str:
    """File extension of the URL basename, without the dot ('' if none)."""
    basename = self.basename
    return basename.split('.')[-1] if '.' in basename else ''
|
||||
|
||||
@cached_property
def base_url(self) -> str:
    """Scheme + domain with no path, e.g. 'https://example.com'."""
    return f'{self.scheme}://{self.domain}'
|
||||
|
||||
@cached_property
def is_static(self) -> bool:
    """True if the URL points directly at a static media/document file."""
    static_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.mp4', '.mp3', '.wav', '.webm'}
    return any(self.url.lower().endswith(ext) for ext in static_extensions)
|
||||
|
||||
@cached_property
def is_archived(self) -> bool:
    """True if any known extractor output file exists in the output directory."""
    # Well-known output paths produced by the various extractors
    output_paths = (
        self.domain,
        'output.html',
        'output.pdf',
        'screenshot.png',
        'singlefile.html',
        'readability/content.html',
        'mercury/content.html',
        'htmltotext.txt',
        'media',
        'git',
    )
    return any((Path(self.output_dir) / path).exists() for path in output_paths)
|
||||
|
||||
# =========================================================================
|
||||
# Date/Time Properties (migrated from Link schema)
|
||||
# =========================================================================
|
||||
|
||||
@cached_property
def bookmarked_date(self) -> Optional[str]:
    """Human-readable date parsed from the (string) timestamp field, if plausible."""
    # Reject timestamps more than ~30 days in the future as garbage
    max_ts = (timezone.now() + timedelta(days=30)).timestamp()
    if self.timestamp and self.timestamp.replace('.', '').isdigit():
        if 0 < float(self.timestamp) < max_ts:
            return self._ts_to_date_str(datetime.fromtimestamp(float(self.timestamp)))
        # Implausible numeric timestamp: return it verbatim rather than a fake date
        return str(self.timestamp)
    return None
|
||||
|
||||
@cached_property
def downloaded_datestr(self) -> Optional[str]:
    """Formatted download date, or None if the snapshot was never downloaded."""
    return self._ts_to_date_str(self.downloaded_at) if self.downloaded_at else None
|
||||
|
||||
@cached_property
def archive_dates(self) -> List[datetime]:
    """Start timestamps of all archive results that actually started."""
    return [
        result.start_ts
        for result in self.archiveresult_set.all()
        if result.start_ts
    ]
|
||||
|
||||
@cached_property
def oldest_archive_date(self) -> Optional[datetime]:
    """Earliest archive start timestamp, or None if never archived."""
    dates = self.archive_dates
    return min(dates) if dates else None
|
||||
|
||||
@cached_property
def newest_archive_date(self) -> Optional[datetime]:
    """Latest archive start timestamp, or None if never archived."""
    dates = self.archive_dates
    return max(dates) if dates else None
|
||||
|
||||
@cached_property
def num_outputs(self) -> int:
    """Count of archive results that succeeded."""
    return self.archiveresult_set.filter(status='succeeded').count()
|
||||
|
||||
@cached_property
def num_failures(self) -> int:
    """Count of archive results that failed."""
    return self.archiveresult_set.filter(status='failed').count()
|
||||
|
||||
# =========================================================================
|
||||
# Output Path Methods (migrated from Link schema)
|
||||
# =========================================================================
|
||||
|
||||
def canonical_outputs(self) -> Dict[str, Optional[str]]:
    """Predict the expected output paths that should be present after archiving.

    Paths are relative to the snapshot's output dir, except the favicon and
    archive.org entries which are absolute external URLs. For static-file
    URLs, all page-rendering outputs collapse to the wget/warc location.
    """
    FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'
    canonical = {
        'index_path': 'index.html',
        'favicon_path': 'favicon.ico',
        'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
        'wget_path': f'warc/{self.timestamp}',
        'warc_path': 'warc/',
        'singlefile_path': 'singlefile.html',
        'readability_path': 'readability/content.html',
        'mercury_path': 'mercury/content.html',
        'htmltotext_path': 'htmltotext.txt',
        'pdf_path': 'output.pdf',
        'screenshot_path': 'screenshot.png',
        'dom_path': 'output.html',
        'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
        'git_path': 'git/',
        'media_path': 'media/',
        'headers_path': 'headers.json',
    }

    if self.is_static:
        # Static files are saved in their wget output location; every renderer
        # output points at that same file
        static_path = f'warc/{self.timestamp}'
        canonical.update({
            'title': self.basename,
            'wget_path': static_path,
            'pdf_path': static_path,
            'screenshot_path': static_path,
            'dom_path': static_path,
            'singlefile_path': static_path,
            'readability_path': static_path,
            'mercury_path': static_path,
            'htmltotext_path': static_path,
        })
    return canonical
|
||||
|
||||
def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
    """Get the most recent output that each archive method produced.

    Args:
        status: optionally restrict to results with this status.

    Returns:
        Mapping of extractor name -> latest output (or None if no result).
    """
    from archivebox.hooks import get_extractors

    latest: Dict[str, Any] = {}
    for archive_method in get_extractors():
        results = self.archiveresult_set.filter(extractor=archive_method)
        if status is not None:
            results = results.filter(status=status)
        results = results.filter(output__isnull=False).order_by('-start_ts')
        latest[archive_method] = results.first().output if results.exists() else None
    return latest
|
||||
|
||||
# =========================================================================
|
||||
# Serialization Methods
|
||||
# =========================================================================
|
||||
|
||||
def to_dict(self, extended: bool = False) -> Dict[str, Any]:
    """Convert this Snapshot to a plain dict (replacement for Link._asdict()).

    Args:
        extended: also include the predicted canonical output paths.
    """
    result = {
        'TYPE': 'core.models.Snapshot',
        'id': str(self.id),
        'url': self.url,
        'timestamp': self.timestamp,
        'title': self.title,
        'tags': self.tags_str(),
        'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None,
        'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
        'created_at': self.created_at.isoformat() if self.created_at else None,
        # Computed properties
        'domain': self.domain,
        'scheme': self.scheme,
        'base_url': self.base_url,
        'path': self.path,
        'basename': self.basename,
        'extension': self.extension,
        'is_static': self.is_static,
        'is_archived': self.is_archived,
        'archive_path': self.archive_path,
        'output_dir': self.output_dir,
        'link_dir': self.output_dir,  # backwards compatibility alias
        'archive_size': self.archive_size,
        'bookmarked_date': self.bookmarked_date,
        'downloaded_datestr': self.downloaded_datestr,
        'num_outputs': self.num_outputs,
        'num_failures': self.num_failures,
    }
    if extended:
        result['canonical'] = self.canonical_outputs()
    return result
|
||||
|
||||
def to_json(self, indent: int = 4) -> str:
    """Serialize this snapshot (extended form) to a JSON string."""
    # Calls the module-level to_json() helper, not this method
    return to_json(self.to_dict(extended=True), indent=indent)
|
||||
|
||||
def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
    """Render this snapshot as a single CSV row of JSON-encoded field values."""
    data = self.to_dict()
    cols = cols or ['timestamp', 'is_archived', 'url']
    return separator.join(to_json(data.get(col, ''), indent=None).ljust(ljust) for col in cols)
|
||||
|
||||
def write_json_details(self, out_dir: Optional[str] = None) -> None:
    """Write the JSON index file for this snapshot to its output directory."""
    out_dir = out_dir or self.output_dir
    path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
    atomic_write(str(path), self.to_dict(extended=True))
|
||||
|
||||
def write_html_details(self, out_dir: Optional[str] = None) -> None:
    """Write the HTML detail page for this snapshot to its output directory."""
    from django.template.loader import render_to_string
    from archivebox.config.common import SERVER_CONFIG
    from archivebox.config.configset import get_config
    from archivebox.misc.logging_util import printable_filesize

    out_dir = out_dir or self.output_dir
    config = get_config()
    SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
    TITLE_LOADING_MSG = 'Not yet archived...'

    canonical = self.canonical_outputs()
    context = {
        **self.to_dict(extended=True),
        **{f'{k}_path': v for k, v in canonical.items()},
        'canonical': {f'{k}_path': v for k, v in canonical.items()},
        'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
        'url_str': htmlencode(urldecode(self.base_url)),
        # NOTE(review): the f-string below is always truthy so the `or` fallback is
        # dead code — presumably it was meant to fall back when no warc exists; confirm.
        'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
        'extension': self.extension or 'html',
        'tags': self.tags_str() or 'untagged',
        'size': printable_filesize(self.archive_size) if self.archive_size else 'pending',
        'status': 'archived' if self.is_archived else 'not yet archived',
        'status_color': 'success' if self.is_archived else 'danger',
        'oldest_archive_date': ts_to_date_str(self.oldest_archive_date),
        'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
        'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
    }
    rendered_html = render_to_string('snapshot.html', context)
    atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
|
||||
|
||||
# =========================================================================
|
||||
# Helper Methods
|
||||
# =========================================================================
|
||||
|
||||
@staticmethod
|
||||
def _ts_to_date_str(dt: Optional[datetime]) -> Optional[str]:
|
||||
return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None
|
||||
|
||||
|
||||
class ArchiveResultManager(models.Manager):
|
||||
def indexable(self, sorted: bool = True):
|
||||
@@ -225,7 +750,7 @@ class ArchiveResultManager(models.Manager):
|
||||
return qs
|
||||
|
||||
|
||||
class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
|
||||
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
|
||||
class StatusChoices(models.TextChoices):
|
||||
QUEUED = 'queued', 'Queued'
|
||||
STARTED = 'started', 'Started'
|
||||
@@ -277,7 +802,7 @@ class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, M
|
||||
|
||||
@cached_property
def snapshot_dir(self):
    """Filesystem path of the parent snapshot's output directory."""
    return Path(self.snapshot.output_dir)
|
||||
|
||||
@cached_property
|
||||
def url(self):
|
||||
@@ -292,7 +817,9 @@ class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, M
|
||||
|
||||
@property
|
||||
def extractor_module(self) -> Any | None:
|
||||
return abx.as_dict(abx.pm.hook.get_EXTRACTORS()).get(self.extractor, None)
|
||||
# Hook scripts are now used instead of Python extractor modules
|
||||
# The extractor name maps to hooks in archivebox/plugins/{extractor}/
|
||||
return None
|
||||
|
||||
def output_exists(self) -> bool:
    """True if this extractor's output path exists in the snapshot dir."""
    return os.path.exists(Path(self.snapshot_dir) / self.extractor)
|
||||
@@ -315,3 +842,150 @@ class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, M
|
||||
|
||||
def save_search_index(self):
    """Currently a no-op; indexing is triggered via trigger_search_indexing()."""
    pass
|
||||
|
||||
def run(self):
    """Execute this ArchiveResult's extractor hook and update status.

    Discovers and runs the hook script for self.extractor, updates
    status/output fields, queues discovered URLs, and triggers indexing.
    """
    from django.utils import timezone
    from archivebox.hooks import discover_hooks, run_hook

    extractor_dir = Path(self.snapshot.output_dir) / self.extractor
    # Config priority order: crawl-level first, snapshot-level overrides last
    config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]

    # Discover the hook for this extractor
    hooks = discover_hooks(f'Snapshot__{self.extractor}')
    if not hooks:
        self.status = self.StatusChoices.FAILED
        self.output = f'No hook found for: {self.extractor}'
        self.retry_at = None
        self.save()
        return

    # Run the hook
    start_ts = timezone.now()
    result = run_hook(
        hooks[0],
        output_dir=extractor_dir,
        config_objects=config_objects,
        url=self.snapshot.url,
    )
    end_ts = timezone.now()

    # Determine status: an explicit JSON status wins, else fall back to exit code
    output_json = result.get('output_json') or {}
    json_status = output_json.get('status')
    if json_status in ('skipped', 'failed'):
        status = json_status
    elif result['returncode'] == 0:
        status = 'succeeded'
    else:
        status = 'failed'

    # Update self from the hook result
    status_map = {
        'succeeded': self.StatusChoices.SUCCEEDED,
        'failed': self.StatusChoices.FAILED,
        'skipped': self.StatusChoices.SKIPPED,
    }
    self.status = status_map.get(status, self.StatusChoices.FAILED)
    # Prefer the hook's explicit output, else the first 1KB of stdout/stderr
    self.output = output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or None
    self.start_ts = start_ts
    self.end_ts = end_ts
    self.retry_at = None
    self.save()

    # Queue any discovered URLs for crawling (parser extractors write urls.jsonl)
    self._queue_urls_for_crawl(extractor_dir)

    # Trigger search indexing if succeeded
    if self.status == self.StatusChoices.SUCCEEDED:
        self.trigger_search_indexing()
|
||||
|
||||
def _queue_urls_for_crawl(self, extractor_dir: Path):
    """Read urls.jsonl and queue discovered URLs for crawling.

    Parser extractors output urls.jsonl with discovered URLs and Tags:
    - Tag records:      {"type": "Tag", "name": "..."}
    - Snapshot records: {"type": "Snapshot", "url": "...", ...}

    Tags are created in the database. URLs get added to the parent Crawl's
    queue with metadata (depth, via_snapshot, via_extractor) for recursive
    crawling.

    Used at all depths:
    - depth=0: initial source file (e.g., bookmarks.html) parsed for URLs
    - depth>0: crawled pages parsed for outbound links
    """
    import json

    # Nothing to queue into if this snapshot isn't part of a crawl
    if not self.snapshot.crawl:
        return

    urls_file = extractor_dir / 'urls.jsonl'
    if not urls_file.exists():
        return

    urls_added = 0
    tags_created = 0  # counted for potential stats/logging
    with open(urls_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
                record_type = entry.get('type', 'Snapshot')

                # Handle Tag records
                if record_type == 'Tag':
                    tag_name = entry.get('name')
                    if tag_name:
                        Tag.objects.get_or_create(name=tag_name)
                        tags_created += 1
                    continue

                # Handle Snapshot records (or records without a type)
                if not entry.get('url'):
                    continue

                # Attach crawl metadata so the child knows its provenance
                entry['depth'] = self.snapshot.depth + 1
                entry['via_snapshot'] = str(self.snapshot.id)
                entry['via_extractor'] = self.extractor

                if self.snapshot.crawl.add_url(entry):
                    urls_added += 1
            except json.JSONDecodeError:
                # Skip malformed lines rather than aborting the whole batch
                continue

    if urls_added > 0:
        self.snapshot.crawl.create_snapshots_from_urls()
|
||||
|
||||
def trigger_search_indexing(self):
    """Run any ArchiveResult__index hooks to update search indexes."""
    from archivebox.hooks import discover_hooks, run_hook

    # Pass config objects in priority order (later overrides earlier)
    config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]

    for hook in discover_hooks('ArchiveResult__index'):
        run_hook(
            hook,
            output_dir=self.output_dir,
            config_objects=config_objects,
            snapshot_id=str(self.snapshot.id),
            extractor=self.extractor,
        )
|
||||
|
||||
@property
def output_dir(self) -> Path:
    """Get the output directory for this extractor's results."""
    return Path(self.snapshot.output_dir) / self.extractor
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
import os
|
||||
import sys
|
||||
@@ -8,17 +8,16 @@ from pathlib import Path
|
||||
|
||||
from django.utils.crypto import get_random_string
|
||||
|
||||
import abx
|
||||
import archivebox
|
||||
|
||||
from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS # noqa
|
||||
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG # noqa
|
||||
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG # noqa
|
||||
|
||||
|
||||
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
|
||||
IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
|
||||
IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
|
||||
IS_GETTING_VERSION_OR_HELP = 'version' in sys.argv or 'help' in sys.argv or '--version' in sys.argv or '--help' in sys.argv
|
||||
IS_MIGRATING = "makemigrations" in sys.argv[:3] or "migrate" in sys.argv[:3]
|
||||
IS_TESTING = "test" in sys.argv[:3] or "PYTEST_CURRENT_TEST" in os.environ
|
||||
IS_SHELL = "shell" in sys.argv[:3] or "shell_plus" in sys.argv[:3]
|
||||
IS_GETTING_VERSION_OR_HELP = "version" in sys.argv or "help" in sys.argv or "--version" in sys.argv or "--help" in sys.argv
|
||||
|
||||
################################################################################
|
||||
### ArchiveBox Plugin Settings
|
||||
@@ -31,71 +30,61 @@ LOADED_PLUGINS = archivebox.LOADED_PLUGINS
|
||||
### Django Core Settings
|
||||
################################################################################
|
||||
|
||||
WSGI_APPLICATION = 'core.wsgi.application'
|
||||
WSGI_APPLICATION = "core.wsgi.application"
|
||||
ASGI_APPLICATION = "core.asgi.application"
|
||||
ROOT_URLCONF = 'core.urls'
|
||||
ROOT_URLCONF = "core.urls"
|
||||
|
||||
LOGIN_URL = '/accounts/login/'
|
||||
LOGOUT_REDIRECT_URL = os.environ.get('LOGOUT_REDIRECT_URL', '/')
|
||||
LOGIN_URL = "/accounts/login/"
|
||||
LOGOUT_REDIRECT_URL = os.environ.get("LOGOUT_REDIRECT_URL", "/")
|
||||
|
||||
PASSWORD_RESET_URL = '/accounts/password_reset/'
|
||||
PASSWORD_RESET_URL = "/accounts/password_reset/"
|
||||
APPEND_SLASH = True
|
||||
|
||||
DEBUG = SHELL_CONFIG.DEBUG or ('--debug' in sys.argv)
|
||||
DEBUG = SHELL_CONFIG.DEBUG or ("--debug" in sys.argv)
|
||||
|
||||
|
||||
INSTALLED_APPS = [
|
||||
'daphne',
|
||||
|
||||
"daphne",
|
||||
# Django default apps
|
||||
'django.contrib.auth',
|
||||
'django.contrib.contenttypes',
|
||||
'django.contrib.sessions',
|
||||
'django.contrib.messages',
|
||||
'django.contrib.staticfiles',
|
||||
'django.contrib.admin',
|
||||
|
||||
"django.contrib.auth",
|
||||
"django.contrib.contenttypes",
|
||||
"django.contrib.sessions",
|
||||
"django.contrib.messages",
|
||||
"django.contrib.staticfiles",
|
||||
"django.contrib.admin",
|
||||
# 3rd-party apps from PyPI
|
||||
'signal_webhooks', # handles REST API outbound webhooks https://github.com/MrThearMan/django-signal-webhooks
|
||||
'django_object_actions', # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
|
||||
|
||||
"signal_webhooks", # handles REST API outbound webhooks https://github.com/MrThearMan/django-signal-webhooks
|
||||
"django_object_actions", # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
|
||||
# Our ArchiveBox-provided apps
|
||||
'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
|
||||
'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
|
||||
'workers', # handles starting and managing background workers and processes (orchestrators and actors)
|
||||
'crawls', # handles Seed, Crawl, and CrawlSchedule models and management
|
||||
'personas', # handles Persona and session management
|
||||
'core', # core django model with Snapshot, ArchiveResult, etc.
|
||||
'api', # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
|
||||
|
||||
# ArchiveBox plugins
|
||||
*abx.as_list(abx.pm.hook.get_INSTALLED_APPS()), # all plugin django-apps found in archivebox/plugins_* and data/user_plugins,
|
||||
|
||||
"config", # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
|
||||
"machine", # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
|
||||
"workers", # handles starting and managing background workers and processes (orchestrators and actors)
|
||||
"crawls", # handles Seed, Crawl, and CrawlSchedule models and management
|
||||
"personas", # handles Persona and session management
|
||||
"core", # core django model with Snapshot, ArchiveResult, etc.
|
||||
"api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
|
||||
# ArchiveBox plugins (hook-based plugins no longer add Django apps)
|
||||
# Use hooks.py discover_hooks() for plugin functionality
|
||||
# 3rd-party apps from PyPI that need to be loaded last
|
||||
'admin_data_views', # handles rendering some convenient automatic read-only views of data in Django admin
|
||||
'django_extensions', # provides Django Debug Toolbar (and other non-debug helpers)
|
||||
'django_huey', # provides multi-queue support for django huey https://github.com/gaiacoop/django-huey
|
||||
'bx_django_utils', # needed for huey_monitor https://github.com/boxine/bx_django_utils
|
||||
'huey_monitor', # adds an admin UI for monitoring background huey tasks https://github.com/boxine/django-huey-monitor
|
||||
|
||||
# load plugins last so all other apps are already .ready() when we call plugins.ready()
|
||||
'abx',
|
||||
"admin_data_views", # handles rendering some convenient automatic read-only views of data in Django admin
|
||||
"django_extensions", # provides Django Debug Toolbar (and other non-debug helpers)
|
||||
"django_huey", # provides multi-queue support for django huey https://github.com/gaiacoop/django-huey
|
||||
"bx_django_utils", # needed for huey_monitor https://github.com/boxine/bx_django_utils
|
||||
"huey_monitor", # adds an admin UI for monitoring background huey tasks https://github.com/boxine/django-huey-monitor
|
||||
]
|
||||
|
||||
|
||||
|
||||
|
||||
MIDDLEWARE = [
|
||||
'core.middleware.TimezoneMiddleware',
|
||||
'django.middleware.security.SecurityMiddleware',
|
||||
'django.contrib.sessions.middleware.SessionMiddleware',
|
||||
'django.middleware.common.CommonMiddleware',
|
||||
'django.middleware.csrf.CsrfViewMiddleware',
|
||||
'django.contrib.auth.middleware.AuthenticationMiddleware',
|
||||
'core.middleware.ReverseProxyAuthMiddleware',
|
||||
'django.contrib.messages.middleware.MessageMiddleware',
|
||||
'core.middleware.CacheControlMiddleware',
|
||||
*abx.as_list(abx.pm.hook.get_MIDDLEWARES()),
|
||||
"core.middleware.TimezoneMiddleware",
|
||||
"django.middleware.security.SecurityMiddleware",
|
||||
"django.contrib.sessions.middleware.SessionMiddleware",
|
||||
"django.middleware.common.CommonMiddleware",
|
||||
"django.middleware.csrf.CsrfViewMiddleware",
|
||||
"django.contrib.auth.middleware.AuthenticationMiddleware",
|
||||
"core.middleware.ReverseProxyAuthMiddleware",
|
||||
"django.contrib.messages.middleware.MessageMiddleware",
|
||||
"core.middleware.CacheControlMiddleware",
|
||||
# Additional middlewares from plugins (if any)
|
||||
]
|
||||
|
||||
|
||||
@@ -106,9 +95,9 @@ MIDDLEWARE = [
|
||||
# AUTH_USER_MODEL = 'auth.User' # cannot be easily changed unfortunately
|
||||
|
||||
AUTHENTICATION_BACKENDS = [
|
||||
'django.contrib.auth.backends.RemoteUserBackend',
|
||||
'django.contrib.auth.backends.ModelBackend',
|
||||
*abx.as_list(abx.pm.hook.get_AUTHENTICATION_BACKENDS()),
|
||||
"django.contrib.auth.backends.RemoteUserBackend",
|
||||
"django.contrib.auth.backends.ModelBackend",
|
||||
# Additional auth backends (e.g., LDAP) configured via settings
|
||||
]
|
||||
|
||||
|
||||
@@ -120,25 +109,25 @@ AUTHENTICATION_BACKENDS = [
|
||||
# AUTH_LDAP_BIND_PASSWORD = LDAP_CONFIG.LDAP_BIND_PASSWORD
|
||||
# AUTH_LDAP_USER_ATTR_MAP = LDAP_CONFIG.LDAP_USER_ATTR_MAP
|
||||
# AUTH_LDAP_USER_SEARCH = LDAP_CONFIG.AUTH_LDAP_USER_SEARCH
|
||||
|
||||
|
||||
# AUTHENTICATION_BACKENDS = LDAP_CONFIG.AUTHENTICATION_BACKENDS
|
||||
|
||||
################################################################################
|
||||
### Staticfile and Template Settings
|
||||
################################################################################
|
||||
|
||||
STATIC_URL = '/static/'
|
||||
TEMPLATES_DIR_NAME = 'templates'
|
||||
STATIC_URL = "/static/"
|
||||
TEMPLATES_DIR_NAME = "templates"
|
||||
CUSTOM_TEMPLATES_ENABLED = os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR) and os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK)
|
||||
STATICFILES_DIRS = [
|
||||
*([str(CONSTANTS.CUSTOM_TEMPLATES_DIR / 'static')] if CUSTOM_TEMPLATES_ENABLED else []),
|
||||
*([str(CONSTANTS.CUSTOM_TEMPLATES_DIR / "static")] if CUSTOM_TEMPLATES_ENABLED else []),
|
||||
# *[
|
||||
# str(plugin_dir / 'static')
|
||||
# for plugin_dir in PLUGIN_DIRS.values()
|
||||
# if (plugin_dir / 'static').is_dir()
|
||||
# ],
|
||||
*abx.as_list(abx.pm.hook.get_STATICFILES_DIRS()),
|
||||
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'static'),
|
||||
# Additional static file dirs from plugins
|
||||
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / "static"),
|
||||
]
|
||||
|
||||
TEMPLATE_DIRS = [
|
||||
@@ -148,23 +137,23 @@ TEMPLATE_DIRS = [
|
||||
# for plugin_dir in PLUGIN_DIRS.values()
|
||||
# if (plugin_dir / 'templates').is_dir()
|
||||
# ],
|
||||
*abx.as_list(abx.pm.hook.get_TEMPLATE_DIRS()),
|
||||
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'core'),
|
||||
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'admin'),
|
||||
# Additional template dirs from plugins
|
||||
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / "core"),
|
||||
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / "admin"),
|
||||
str(PACKAGE_DIR / TEMPLATES_DIR_NAME),
|
||||
]
|
||||
|
||||
TEMPLATES = [
|
||||
{
|
||||
'BACKEND': 'django.template.backends.django.DjangoTemplates',
|
||||
'DIRS': TEMPLATE_DIRS,
|
||||
'APP_DIRS': True,
|
||||
'OPTIONS': {
|
||||
'context_processors': [
|
||||
'django.template.context_processors.debug',
|
||||
'django.template.context_processors.request',
|
||||
'django.contrib.auth.context_processors.auth',
|
||||
'django.contrib.messages.context_processors.messages',
|
||||
"BACKEND": "django.template.backends.django.DjangoTemplates",
|
||||
"DIRS": TEMPLATE_DIRS,
|
||||
"APP_DIRS": True,
|
||||
"OPTIONS": {
|
||||
"context_processors": [
|
||||
"django.template.context_processors.debug",
|
||||
"django.template.context_processors.request",
|
||||
"django.contrib.auth.context_processors.auth",
|
||||
"django.contrib.messages.context_processors.messages",
|
||||
],
|
||||
},
|
||||
},
|
||||
@@ -221,10 +210,10 @@ DATABASES = {
|
||||
# **SQLITE_CONNECTION_OPTIONS,
|
||||
# },
|
||||
}
|
||||
MIGRATION_MODULES = {'signal_webhooks': None}
|
||||
MIGRATION_MODULES = {"signal_webhooks": None}
|
||||
|
||||
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
|
||||
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
|
||||
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
|
||||
|
||||
HUEY = {
|
||||
"huey_class": "huey.SqliteHuey",
|
||||
@@ -254,7 +243,7 @@ DJANGO_HUEY = {
|
||||
"queues": {
|
||||
HUEY["name"]: HUEY.copy(),
|
||||
# more registered here at plugin import-time by BaseQueue.register()
|
||||
**abx.as_dict(abx.pm.hook.get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME=CONSTANTS.QUEUE_DATABASE_FILENAME)),
|
||||
# Additional huey queues configured via settings
|
||||
},
|
||||
}
|
||||
|
||||
@@ -274,12 +263,12 @@ class HueyDBRouter:
|
||||
def db_for_read(self, model, **hints):
|
||||
if model._meta.app_label in self.route_app_labels:
|
||||
return self.db_name
|
||||
return 'default'
|
||||
return "default"
|
||||
|
||||
def db_for_write(self, model, **hints):
|
||||
if model._meta.app_label in self.route_app_labels:
|
||||
return self.db_name
|
||||
return 'default'
|
||||
return "default"
|
||||
|
||||
def allow_relation(self, obj1, obj2, **hints):
|
||||
if obj1._meta.app_label in self.route_app_labels or obj2._meta.app_label in self.route_app_labels:
|
||||
@@ -291,6 +280,7 @@ class HueyDBRouter:
|
||||
return db == self.db_name
|
||||
return db == "default"
|
||||
|
||||
|
||||
# class FilestoreDBRouter:
|
||||
# """
|
||||
# A router to store all the File models in the filestore.sqlite3 database.
|
||||
@@ -321,16 +311,16 @@ class HueyDBRouter:
|
||||
# return db == self.db_name
|
||||
# return db == "default"
|
||||
|
||||
DATABASE_ROUTERS = ['core.settings.HueyDBRouter']
|
||||
DATABASE_ROUTERS = ["core.settings.HueyDBRouter"]
|
||||
|
||||
CACHES = {
|
||||
'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'},
|
||||
"default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"},
|
||||
# 'sqlite': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'},
|
||||
# 'dummy': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'},
|
||||
# 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'},
|
||||
}
|
||||
|
||||
EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
|
||||
EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend"
|
||||
|
||||
|
||||
STORAGES = {
|
||||
@@ -363,32 +353,28 @@ STORAGES = {
|
||||
# },
|
||||
}
|
||||
|
||||
CHANNEL_LAYERS = {
|
||||
"default": {
|
||||
"BACKEND": "channels.layers.InMemoryChannelLayer"
|
||||
}
|
||||
}
|
||||
CHANNEL_LAYERS = {"default": {"BACKEND": "channels.layers.InMemoryChannelLayer"}}
|
||||
|
||||
################################################################################
|
||||
### Security Settings
|
||||
################################################################################
|
||||
|
||||
SECRET_KEY = SERVER_CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
|
||||
SECRET_KEY = SERVER_CONFIG.SECRET_KEY or get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_")
|
||||
|
||||
ALLOWED_HOSTS = SERVER_CONFIG.ALLOWED_HOSTS.split(',')
|
||||
CSRF_TRUSTED_ORIGINS = list(set(SERVER_CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
|
||||
ALLOWED_HOSTS = SERVER_CONFIG.ALLOWED_HOSTS.split(",")
|
||||
CSRF_TRUSTED_ORIGINS = list(set(SERVER_CONFIG.CSRF_TRUSTED_ORIGINS.split(",")))
|
||||
|
||||
# automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
|
||||
# but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS
|
||||
for hostname in ALLOWED_HOSTS:
|
||||
https_endpoint = f'https://{hostname}'
|
||||
if hostname != '*' and https_endpoint not in CSRF_TRUSTED_ORIGINS:
|
||||
print(f'[!] WARNING: {https_endpoint} from ALLOWED_HOSTS should be added to CSRF_TRUSTED_ORIGINS')
|
||||
https_endpoint = f"https://{hostname}"
|
||||
if hostname != "*" and https_endpoint not in CSRF_TRUSTED_ORIGINS:
|
||||
print(f"[!] WARNING: {https_endpoint} from ALLOWED_HOSTS should be added to CSRF_TRUSTED_ORIGINS")
|
||||
CSRF_TRUSTED_ORIGINS.append(https_endpoint)
|
||||
|
||||
SECURE_BROWSER_XSS_FILTER = True
|
||||
SECURE_CONTENT_TYPE_NOSNIFF = True
|
||||
SECURE_REFERRER_POLICY = 'strict-origin-when-cross-origin'
|
||||
SECURE_REFERRER_POLICY = "strict-origin-when-cross-origin"
|
||||
|
||||
CSRF_COOKIE_SECURE = False
|
||||
SESSION_COOKIE_SECURE = False
|
||||
@@ -401,10 +387,10 @@ SESSION_SAVE_EVERY_REQUEST = False
|
||||
SESSION_ENGINE = "django.contrib.sessions.backends.db"
|
||||
|
||||
AUTH_PASSWORD_VALIDATORS = [
|
||||
{'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator'},
|
||||
{'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator'},
|
||||
{'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator'},
|
||||
{'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
|
||||
{"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator"},
|
||||
{"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator"},
|
||||
{"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator"},
|
||||
{"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator"},
|
||||
]
|
||||
|
||||
DATA_UPLOAD_MAX_NUMBER_FIELDS = None
|
||||
@@ -414,29 +400,29 @@ DATA_UPLOAD_MAX_MEMORY_SIZE = 26_214_400 # 25MB
|
||||
### Shell Settings
|
||||
################################################################################
|
||||
|
||||
SHELL_PLUS = 'ipython'
|
||||
SHELL_PLUS = "ipython"
|
||||
SHELL_PLUS_PRINT_SQL = False
|
||||
IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner']
|
||||
IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell'
|
||||
IPYTHON_ARGUMENTS = ["--no-confirm-exit", "--no-banner"]
|
||||
IPYTHON_KERNEL_DISPLAY_NAME = "ArchiveBox Django Shell"
|
||||
if IS_SHELL:
|
||||
os.environ['PYTHONSTARTUP'] = str(PACKAGE_DIR / 'misc' / 'shell_welcome_message.py')
|
||||
os.environ["PYTHONSTARTUP"] = str(PACKAGE_DIR / "misc" / "shell_welcome_message.py")
|
||||
|
||||
|
||||
################################################################################
|
||||
### Internationalization & Localization Settings
|
||||
################################################################################
|
||||
|
||||
LANGUAGE_CODE = 'en-us'
|
||||
LANGUAGE_CODE = "en-us"
|
||||
USE_I18N = True
|
||||
USE_TZ = True
|
||||
DATETIME_FORMAT = 'Y-m-d h:i:s A'
|
||||
SHORT_DATETIME_FORMAT = 'Y-m-d h:i:s A'
|
||||
TIME_ZONE = CONSTANTS.TIMEZONE # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent
|
||||
DATETIME_FORMAT = "Y-m-d h:i:s A"
|
||||
SHORT_DATETIME_FORMAT = "Y-m-d h:i:s A"
|
||||
TIME_ZONE = CONSTANTS.TIMEZONE # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent
|
||||
|
||||
|
||||
from django.conf.locale.en import formats as en_formats # type: ignore
|
||||
from django.conf.locale.en import formats as en_formats # type: ignore
|
||||
|
||||
en_formats.DATETIME_FORMAT = DATETIME_FORMAT # monkey patch en_format default with our preferred format
|
||||
en_formats.DATETIME_FORMAT = DATETIME_FORMAT # monkey patch en_format default with our preferred format
|
||||
en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT
|
||||
|
||||
|
||||
@@ -455,7 +441,7 @@ LOGGING = SETTINGS_LOGGING
|
||||
################################################################################
|
||||
|
||||
# Add default webhook configuration to the User model
|
||||
SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'
|
||||
SIGNAL_WEBHOOKS_CUSTOM_MODEL = "api.models.OutboundWebhook"
|
||||
SIGNAL_WEBHOOKS = {
|
||||
"HOOKS": {
|
||||
# ... is a special sigil value that means "use the default autogenerated hooks"
|
||||
@@ -524,7 +510,7 @@ ADMIN_DATA_VIEWS = {
|
||||
"name": "log",
|
||||
},
|
||||
},
|
||||
*abx.as_list(abx.pm.hook.get_ADMIN_DATA_VIEWS_URLS()),
|
||||
# Additional admin data views from plugins
|
||||
],
|
||||
}
|
||||
|
||||
@@ -535,44 +521,45 @@ ADMIN_DATA_VIEWS = {
|
||||
|
||||
# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode)
|
||||
DEBUG_TOOLBAR = False
|
||||
DEBUG_TOOLBAR = DEBUG_TOOLBAR and DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
|
||||
DEBUG_TOOLBAR = DEBUG_TOOLBAR and DEBUG and ("--nothreading" in sys.argv) and ("--reload" not in sys.argv)
|
||||
if DEBUG_TOOLBAR:
|
||||
try:
|
||||
import debug_toolbar # noqa
|
||||
import debug_toolbar # noqa
|
||||
|
||||
DEBUG_TOOLBAR = True
|
||||
except ImportError:
|
||||
DEBUG_TOOLBAR = False
|
||||
|
||||
if DEBUG_TOOLBAR:
|
||||
INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
|
||||
INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
|
||||
INSTALLED_APPS = [*INSTALLED_APPS, "debug_toolbar"]
|
||||
INTERNAL_IPS = ["0.0.0.0", "127.0.0.1", "*"]
|
||||
DEBUG_TOOLBAR_CONFIG = {
|
||||
"SHOW_TOOLBAR_CALLBACK": lambda request: True,
|
||||
"RENDER_PANELS": True,
|
||||
}
|
||||
DEBUG_TOOLBAR_PANELS = [
|
||||
'debug_toolbar.panels.history.HistoryPanel',
|
||||
'debug_toolbar.panels.versions.VersionsPanel',
|
||||
'debug_toolbar.panels.timer.TimerPanel',
|
||||
'debug_toolbar.panels.settings.SettingsPanel',
|
||||
'debug_toolbar.panels.headers.HeadersPanel',
|
||||
'debug_toolbar.panels.request.RequestPanel',
|
||||
'debug_toolbar.panels.sql.SQLPanel',
|
||||
'debug_toolbar.panels.staticfiles.StaticFilesPanel',
|
||||
"debug_toolbar.panels.history.HistoryPanel",
|
||||
"debug_toolbar.panels.versions.VersionsPanel",
|
||||
"debug_toolbar.panels.timer.TimerPanel",
|
||||
"debug_toolbar.panels.settings.SettingsPanel",
|
||||
"debug_toolbar.panels.headers.HeadersPanel",
|
||||
"debug_toolbar.panels.request.RequestPanel",
|
||||
"debug_toolbar.panels.sql.SQLPanel",
|
||||
"debug_toolbar.panels.staticfiles.StaticFilesPanel",
|
||||
# 'debug_toolbar.panels.templates.TemplatesPanel',
|
||||
'debug_toolbar.panels.cache.CachePanel',
|
||||
'debug_toolbar.panels.signals.SignalsPanel',
|
||||
'debug_toolbar.panels.logging.LoggingPanel',
|
||||
'debug_toolbar.panels.redirects.RedirectsPanel',
|
||||
'debug_toolbar.panels.profiling.ProfilingPanel',
|
||||
'djdt_flamegraph.FlamegraphPanel',
|
||||
"debug_toolbar.panels.cache.CachePanel",
|
||||
"debug_toolbar.panels.signals.SignalsPanel",
|
||||
"debug_toolbar.panels.logging.LoggingPanel",
|
||||
"debug_toolbar.panels.redirects.RedirectsPanel",
|
||||
"debug_toolbar.panels.profiling.ProfilingPanel",
|
||||
"djdt_flamegraph.FlamegraphPanel",
|
||||
]
|
||||
MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
|
||||
MIDDLEWARE = [*MIDDLEWARE, "debug_toolbar.middleware.DebugToolbarMiddleware"]
|
||||
|
||||
if DEBUG:
|
||||
from django_autotyping.typing import AutotypingSettingsDict
|
||||
|
||||
INSTALLED_APPS += ['django_autotyping']
|
||||
INSTALLED_APPS += ["django_autotyping"]
|
||||
AUTOTYPING: AutotypingSettingsDict = {
|
||||
"STUBS_GENERATION": {
|
||||
"LOCAL_STUBS_DIR": PACKAGE_DIR / "typings",
|
||||
|
||||
@@ -79,15 +79,16 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
print(f'{self}.on_started() ↳ snapshot.create_pending_archiveresults() + snapshot.bump_retry_at(+60s)')
|
||||
print(f'{self}.on_started() ↳ snapshot.run()')
|
||||
# lock the snapshot while we create the pending archiveresults
|
||||
self.snapshot.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
|
||||
)
|
||||
# create the pending archiveresults
|
||||
self.snapshot.create_pending_archiveresults()
|
||||
|
||||
# unlock the snapshot after we're done creating the pending archiveresults + set status = started
|
||||
|
||||
# Run the snapshot - creates pending archiveresults for all enabled extractors
|
||||
self.snapshot.run()
|
||||
|
||||
# unlock the snapshot after we're done + set status = started
|
||||
self.snapshot.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=5), # wait 5s before checking it again
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
@@ -135,19 +136,22 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
|
||||
succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
|
||||
failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
|
||||
skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
|
||||
|
||||
# Tick Event
|
||||
# Tick Event - transitions based on conditions
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished') |
|
||||
started.to(succeeded, cond='is_succeeded') |
|
||||
started.to(failed, cond='is_failed') |
|
||||
started.to(skipped, cond='is_skipped') |
|
||||
started.to(backoff, cond='is_backoff') |
|
||||
backoff.to.itself(unless='can_start') |
|
||||
backoff.to(started, cond='can_start') |
|
||||
backoff.to(succeeded, cond='is_succeeded') |
|
||||
backoff.to(failed, cond='is_failed')
|
||||
backoff.to(failed, cond='is_failed') |
|
||||
backoff.to(skipped, cond='is_skipped')
|
||||
)
|
||||
|
||||
def __init__(self, archiveresult, *args, **kwargs):
|
||||
@@ -167,22 +171,32 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
return can_start
|
||||
|
||||
def is_succeeded(self) -> bool:
|
||||
if self.archiveresult.output and 'err' not in self.archiveresult.output.lower():
|
||||
return True
|
||||
return False
|
||||
"""Check if extraction succeeded (status was set by run_extractor())."""
|
||||
return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
|
||||
|
||||
def is_failed(self) -> bool:
|
||||
if self.archiveresult.output and 'err' in self.archiveresult.output.lower():
|
||||
return True
|
||||
return False
|
||||
"""Check if extraction failed (status was set by run_extractor())."""
|
||||
return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
|
||||
|
||||
def is_skipped(self) -> bool:
|
||||
"""Check if extraction was skipped (status was set by run_extractor())."""
|
||||
return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
|
||||
|
||||
def is_backoff(self) -> bool:
|
||||
if self.archiveresult.output is None:
|
||||
return True
|
||||
return False
|
||||
"""Check if we should backoff and retry later."""
|
||||
# Backoff if status is still started (extractor didn't complete) and output is None
|
||||
return (
|
||||
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
|
||||
self.archiveresult.output is None
|
||||
)
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
return self.is_failed() or self.is_succeeded()
|
||||
"""Check if extraction has completed (success, failure, or skipped)."""
|
||||
return self.archiveresult.status in (
|
||||
ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
)
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
@@ -195,27 +209,28 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
print(f'{self}.on_started() ↳ archiveresult.start_ts + create_output_dir() + bump_retry_at(+60s)')
|
||||
# lock the object for the next 30sec
|
||||
self.archiveresult.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=30),
|
||||
status=ArchiveResult.StatusChoices.QUEUED,
|
||||
start_ts=timezone.now(),
|
||||
) # lock the obj for the next ~30s to limit racing with other workers
|
||||
|
||||
# create the output directory and fork the new extractor job subprocess
|
||||
self.archiveresult.create_output_dir()
|
||||
# self.archiveresult.extract(background=True)
|
||||
print(f'{self}.on_started() ↳ archiveresult.start_ts + run_extractor()')
|
||||
|
||||
# mark the object as started
|
||||
# Lock the object and mark start time
|
||||
self.archiveresult.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=30), # retry it again in 30s if it fails
|
||||
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for extractor
|
||||
status=ArchiveResult.StatusChoices.STARTED,
|
||||
start_ts=timezone.now(),
|
||||
)
|
||||
|
||||
# simulate slow running extractor that completes after 2 seconds
|
||||
time.sleep(2)
|
||||
self.archiveresult.update_for_workers(output='completed')
|
||||
# Run the extractor - this updates status, output, timestamps, etc.
|
||||
self.archiveresult.run()
|
||||
|
||||
# Save the updated result
|
||||
self.archiveresult.save()
|
||||
|
||||
# Log the result
|
||||
if self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
|
||||
print(f'{self} ✅ extractor succeeded: {self.archiveresult.output[:50] if self.archiveresult.output else ""}...')
|
||||
elif self.archiveresult.status == ArchiveResult.StatusChoices.FAILED:
|
||||
print(f'{self} ❌ extractor failed: {self.archiveresult.output[:100] if self.archiveresult.output else ""}...')
|
||||
elif self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED:
|
||||
print(f'{self} ⏭️ extractor skipped: {self.archiveresult.output[:50] if self.archiveresult.output else ""}')
|
||||
|
||||
@backoff.enter
|
||||
def enter_backoff(self):
|
||||
@@ -246,7 +261,15 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.FAILED,
|
||||
end_ts=timezone.now(),
|
||||
# **self.archiveresult.get_output_dict(), # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
|
||||
)
|
||||
|
||||
@skipped.enter
|
||||
def enter_skipped(self):
|
||||
print(f'{self}.on_skipped() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
||||
self.archiveresult.update_for_workers(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.SKIPPED,
|
||||
end_ts=timezone.now(),
|
||||
)
|
||||
|
||||
def after_transition(self, event: str, source: State, target: State):
|
||||
|
||||
@@ -23,8 +23,9 @@ from admin_data_views.typing import TableContext, ItemContext
|
||||
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
|
||||
|
||||
import archivebox
|
||||
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
|
||||
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
|
||||
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION, SAVE_ARCHIVE_DOT_ORG
|
||||
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
|
||||
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
|
||||
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
|
||||
from archivebox.misc.serve_static import serve_static_with_byterange_support
|
||||
from archivebox.misc.logging_util import printable_filesize
|
||||
@@ -101,7 +102,7 @@ class SnapshotView(View):
|
||||
|
||||
|
||||
# iterate through all the files in the snapshot dir and add the biggest ones to1 the result list
|
||||
snap_dir = Path(snapshot.link_dir)
|
||||
snap_dir = Path(snapshot.output_dir)
|
||||
if not os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK):
|
||||
return {}
|
||||
|
||||
@@ -131,9 +132,7 @@ class SnapshotView(View):
|
||||
best_result = archiveresults[result_type]
|
||||
break
|
||||
|
||||
link = snapshot.as_link()
|
||||
|
||||
link_info = link._asdict(extended=True)
|
||||
snapshot_info = snapshot.to_dict(extended=True)
|
||||
|
||||
try:
|
||||
warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name
|
||||
@@ -141,24 +140,23 @@ class SnapshotView(View):
|
||||
warc_path = 'warc/'
|
||||
|
||||
context = {
|
||||
**link_info,
|
||||
**link_info['canonical'],
|
||||
**snapshot_info,
|
||||
**snapshot_info.get('canonical', {}),
|
||||
'title': htmlencode(
|
||||
link.title
|
||||
or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
|
||||
snapshot.title
|
||||
or (snapshot.base_url if snapshot.is_archived else TITLE_LOADING_MSG)
|
||||
),
|
||||
'extension': link.extension or 'html',
|
||||
'tags': link.tags or 'untagged',
|
||||
'size': printable_filesize(link.archive_size) if link.archive_size else 'pending',
|
||||
'status': 'archived' if link.is_archived else 'not yet archived',
|
||||
'status_color': 'success' if link.is_archived else 'danger',
|
||||
'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
|
||||
'extension': snapshot.extension or 'html',
|
||||
'tags': snapshot.tags_str() or 'untagged',
|
||||
'size': printable_filesize(snapshot.archive_size) if snapshot.archive_size else 'pending',
|
||||
'status': 'archived' if snapshot.is_archived else 'not yet archived',
|
||||
'status_color': 'success' if snapshot.is_archived else 'danger',
|
||||
'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date),
|
||||
'warc_path': warc_path,
|
||||
'SAVE_ARCHIVE_DOT_ORG': archivebox.pm.hook.get_FLAT_CONFIG().SAVE_ARCHIVE_DOT_ORG,
|
||||
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
|
||||
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
|
||||
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
|
||||
'best_result': best_result,
|
||||
# 'tags_str': 'somealskejrewlkrjwer,werlmwrwlekrjewlkrjwer324m532l,4m32,23m324234',
|
||||
}
|
||||
return render(template_name='core/snapshot_live.html', request=request, context=context)
|
||||
|
||||
@@ -190,7 +188,7 @@ class SnapshotView(View):
|
||||
response = self.render_live_index(request, snapshot)
|
||||
else:
|
||||
response = serve_static_with_byterange_support(
|
||||
request, archivefile, document_root=snapshot.link_dir, show_indexes=True,
|
||||
request, archivefile, document_root=snapshot.output_dir, show_indexes=True,
|
||||
)
|
||||
response["Link"] = f'<{snapshot.url}>; rel="canonical"'
|
||||
return response
|
||||
@@ -516,7 +514,7 @@ class HealthCheckView(View):
|
||||
|
||||
|
||||
def find_config_section(key: str) -> str:
|
||||
CONFIGS = archivebox.pm.hook.get_CONFIGS()
|
||||
CONFIGS = get_all_configs()
|
||||
|
||||
if key in CONSTANTS_CONFIG:
|
||||
return 'CONSTANT'
|
||||
@@ -527,7 +525,7 @@ def find_config_section(key: str) -> str:
|
||||
return section
|
||||
|
||||
def find_config_default(key: str) -> str:
|
||||
CONFIGS = archivebox.pm.hook.get_CONFIGS()
|
||||
CONFIGS = get_all_configs()
|
||||
|
||||
if key in CONSTANTS_CONFIG:
|
||||
return str(CONSTANTS_CONFIG[key])
|
||||
@@ -550,7 +548,7 @@ def find_config_default(key: str) -> str:
|
||||
return default_val
|
||||
|
||||
def find_config_type(key: str) -> str:
|
||||
CONFIGS = archivebox.pm.hook.get_CONFIGS()
|
||||
CONFIGS = get_all_configs()
|
||||
|
||||
for config in CONFIGS.values():
|
||||
if hasattr(config, key):
|
||||
@@ -569,7 +567,7 @@ def key_is_safe(key: str) -> bool:
|
||||
|
||||
@render_with_table_view
|
||||
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
CONFIGS = archivebox.pm.hook.get_CONFIGS()
|
||||
CONFIGS = get_all_configs()
|
||||
|
||||
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||
|
||||
@@ -611,8 +609,8 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
|
||||
@render_with_item_view
|
||||
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
CONFIGS = archivebox.pm.hook.get_CONFIGS()
|
||||
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
|
||||
CONFIGS = get_all_configs()
|
||||
FLAT_CONFIG = get_flat_config()
|
||||
|
||||
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||
|
||||
|
||||
Reference in New Issue
Block a user