mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 15:27:53 +10:00
way better plugin hooks system wip
This commit is contained in:
@@ -1,33 +1,30 @@
|
||||
"""
|
||||
WSGI config for archivebox project.
|
||||
ASGI config for archivebox project.
|
||||
|
||||
It exposes the WSGI callable as a module-level variable named ``application``.
|
||||
It exposes the ASGI callable as a module-level variable named ``application``.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
|
||||
https://docs.djangoproject.com/en/stable/howto/deployment/asgi/
|
||||
"""
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
setup_django(in_memory_db=False, check_db=True)
|
||||
|
||||
|
||||
# from channels.auth import AuthMiddlewareStack
|
||||
# from channels.security.websocket import AllowedHostsOriginValidator
|
||||
from channels.routing import ProtocolTypeRouter # , URLRouter
|
||||
from django.core.asgi import get_asgi_application
|
||||
|
||||
# Standard Django ASGI application (no websockets/channels needed)
|
||||
application = get_asgi_application()
|
||||
|
||||
# If websocket support is needed later, install channels and use:
|
||||
# from channels.routing import ProtocolTypeRouter, URLRouter
|
||||
# from channels.auth import AuthMiddlewareStack
|
||||
# from channels.security.websocket import AllowedHostsOriginValidator
|
||||
# from core.routing import websocket_urlpatterns
|
||||
|
||||
|
||||
django_asgi_app = get_asgi_application()
|
||||
|
||||
application = ProtocolTypeRouter(
|
||||
{
|
||||
"http": django_asgi_app,
|
||||
# only if we need websocket support later:
|
||||
# "websocket": AllowedHostsOriginValidator(
|
||||
# AuthMiddlewareStack(URLRouter(websocket_urlpatterns))
|
||||
# ),
|
||||
}
|
||||
)
|
||||
#
|
||||
# application = ProtocolTypeRouter({
|
||||
# "http": get_asgi_application(),
|
||||
# "websocket": AllowedHostsOriginValidator(
|
||||
# AuthMiddlewareStack(URLRouter(websocket_urlpatterns))
|
||||
# ),
|
||||
# })
|
||||
|
||||
@@ -69,7 +69,7 @@ class Migration(migrations.Migration):
|
||||
model_name='archiveresult',
|
||||
name='binary',
|
||||
field=models.ForeignKey(
|
||||
'machine.InstalledBinary',
|
||||
'machine.Binary',
|
||||
on_delete=models.SET_NULL,
|
||||
null=True,
|
||||
blank=True,
|
||||
|
||||
27
archivebox/core/migrations/0031_snapshot_parent_snapshot.py
Normal file
27
archivebox/core/migrations/0031_snapshot_parent_snapshot.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# Generated by Django 6.0 on 2025-12-27
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add the self-referential ``parent_snapshot`` foreign key to ``Snapshot``."""

    # Applied after core migration 0030_migrate_output_field.
    dependencies = [
        ('core', '0030_migrate_output_field'),
    ]

    operations = [
        migrations.AddField(
            model_name='snapshot',
            name='parent_snapshot',
            # Optional (null/blank) link from a snapshot to another row of
            # core.snapshot; the reverse accessor is ``child_snapshots``.
            # on_delete=SET_NULL: deleting the referenced snapshot nulls this
            # column on its children instead of deleting them.
            field=models.ForeignKey(
                blank=True,
                db_index=True,
                help_text='Parent snapshot that discovered this URL (for recursive crawling)',
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name='child_snapshots',
                to='core.snapshot'
            ),
        ),
    ]
|
||||
@@ -0,0 +1,58 @@
|
||||
# Generated by Django 6.0 on 2025-12-28 05:12
|
||||
|
||||
import django.db.models.deletion
|
||||
import uuid
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Alter several ``ArchiveResult`` fields and add a unique constraint on ``Snapshot.timestamp``."""

    # Depends on the latest listed migrations of core, crawls and machine,
    # plus whatever app provides the configured user model.
    dependencies = [
        ('core', '0031_snapshot_parent_snapshot'),
        ('crawls', '0004_alter_crawl_output_dir'),
        ('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        # ArchiveResult.binary: nullable FK to machine.binary, nulled on delete.
        migrations.AlterField(
            model_name='archiveresult',
            name='binary',
            field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'),
        ),
        # Output bookkeeping fields; each one carries an explicit default.
        migrations.AlterField(
            model_name='archiveresult',
            name='output_files',
            field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_json',
            field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_mimetypes',
            field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_size',
            field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_str',
            field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
        ),
        # NOTE(review): uuid.uuid7 is only available in the standard library
        # from Python 3.14 -- confirm the minimum supported interpreter, since
        # importing this migration on an older version would raise
        # AttributeError.
        migrations.AlterField(
            model_name='archiveresult',
            name='uuid',
            field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
        ),
        # NOTE(review): presumably this fails to apply if existing snapshot
        # rows share a timestamp -- confirm the data is deduplicated first.
        migrations.AddConstraint(
            model_name='snapshot',
            constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
        ),
    ]
|
||||
@@ -0,0 +1,29 @@
|
||||
# Generated by Django 6.0 on 2025-12-28
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Rename ``ArchiveResult.extractor`` to ``plugin`` and add ``hook_name``."""

    # Applied after core migration 0032_alter_archiveresult_binary_and_more.
    dependencies = [
        ('core', '0032_alter_archiveresult_binary_and_more'),
    ]

    operations = [
        # Rename the existing field in place (old name -> new name).
        migrations.RenameField(
            model_name='archiveresult',
            old_name='extractor',
            new_name='plugin',
        ),
        # New indexed, optional text field; defaults to the empty string.
        migrations.AddField(
            model_name='archiveresult',
            name='hook_name',
            field=models.CharField(
                blank=True,
                default='',
                max_length=255,
                db_index=True,
                help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)'
            ),
        ),
    ]
|
||||
File diff suppressed because it is too large
Load Diff
@@ -57,7 +57,7 @@ INSTALLED_APPS = [
|
||||
"django_object_actions", # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
|
||||
# Our ArchiveBox-provided apps
|
||||
"config", # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
|
||||
"machine", # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
|
||||
"machine", # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
|
||||
"workers", # handles starting and managing background workers and processes (orchestrators and actors)
|
||||
"crawls", # handles Crawl and CrawlSchedule models and management
|
||||
"personas", # handles Persona and session management
|
||||
|
||||
@@ -64,16 +64,10 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||
if self.snapshot.pending_archiveresults().exists():
|
||||
return False
|
||||
|
||||
# Check for background hooks that are still running
|
||||
started_results = self.snapshot.archiveresult_set.filter(
|
||||
status=ArchiveResult.StatusChoices.STARTED
|
||||
)
|
||||
for result in started_results:
|
||||
if not result.check_background_completed():
|
||||
return False # Still running
|
||||
|
||||
# Completed - finalize it
|
||||
result.finalize_background_hook()
|
||||
# Don't wait for background hooks - they'll be cleaned up on entering sealed state
|
||||
# Background hooks in STARTED state are excluded by pending_archiveresults()
|
||||
# (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
|
||||
# we can transition to sealed and cleanup() will kill the background hooks
|
||||
|
||||
# otherwise archiveresults exist and are all finished, so it's finished
|
||||
return True
|
||||
@@ -108,6 +102,9 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||
|
||||
@sealed.enter
|
||||
def enter_sealed(self):
|
||||
# Clean up background hooks
|
||||
self.snapshot.cleanup()
|
||||
|
||||
# Suppressed: state transition logs
|
||||
self.snapshot.update_for_workers(
|
||||
retry_at=None,
|
||||
|
||||
Reference in New Issue
Block a user