way better plugin hooks system wip

This commit is contained in:
Nick Sweeting
2025-12-28 03:39:59 -08:00
parent a38624a4dd
commit 50e527ec65
156 changed files with 10275 additions and 7149 deletions

View File

@@ -1,33 +1,30 @@
"""
WSGI config for archivebox project.
ASGI config for archivebox project.
It exposes the WSGI callable as a module-level variable named ``application``.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
https://docs.djangoproject.com/en/stable/howto/deployment/asgi/
"""
from archivebox.config.django import setup_django
setup_django(in_memory_db=False, check_db=True)
# from channels.auth import AuthMiddlewareStack
# from channels.security.websocket import AllowedHostsOriginValidator
from channels.routing import ProtocolTypeRouter # , URLRouter
from django.core.asgi import get_asgi_application
# Standard Django ASGI application (no websockets/channels needed)
application = get_asgi_application()
# If websocket support is needed later, install channels and use:
# from channels.routing import ProtocolTypeRouter, URLRouter
# from channels.auth import AuthMiddlewareStack
# from channels.security.websocket import AllowedHostsOriginValidator
# from core.routing import websocket_urlpatterns
django_asgi_app = get_asgi_application()
application = ProtocolTypeRouter(
{
"http": django_asgi_app,
# only if we need websocket support later:
# "websocket": AllowedHostsOriginValidator(
# AuthMiddlewareStack(URLRouter(websocket_urlpatterns))
# ),
}
)
#
# application = ProtocolTypeRouter({
# "http": get_asgi_application(),
# "websocket": AllowedHostsOriginValidator(
# AuthMiddlewareStack(URLRouter(websocket_urlpatterns))
# ),
# })

View File

@@ -69,7 +69,7 @@ class Migration(migrations.Migration):
model_name='archiveresult',
name='binary',
field=models.ForeignKey(
'machine.InstalledBinary',
'machine.Binary',
on_delete=models.SET_NULL,
null=True,
blank=True,

View File

@@ -0,0 +1,27 @@
# Generated by Django 6.0 on 2025-12-27
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add an optional ``parent_snapshot`` foreign key to the ``snapshot`` model."""

    # Runs after the previous migration of the ``core`` app.
    dependencies = [("core", "0030_migrate_output_field")]

    operations = [
        migrations.AddField(
            model_name="snapshot",
            name="parent_snapshot",
            field=models.ForeignKey(
                # Target and reverse accessor of the relation.
                to="core.snapshot",
                related_name="child_snapshots",
                # Deleting the referenced row nulls this column instead of cascading.
                on_delete=django.db.models.deletion.SET_NULL,
                null=True,
                blank=True,
                db_index=True,
                help_text="Parent snapshot that discovered this URL (for recursive crawling)",
            ),
        ),
    ]

View File

@@ -0,0 +1,58 @@
# Generated by Django 6.0 on 2025-12-28 05:12
import django.db.models.deletion
import uuid
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
    """Re-declare several ``archiveresult`` fields and make ``snapshot.timestamp`` unique."""

    dependencies = [
        ("core", "0031_snapshot_parent_snapshot"),
        ("crawls", "0004_alter_crawl_output_dir"),
        ("machine", "0003_alter_dependency_id_alter_installedbinary_dependency_and_more"),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        # --- archiveresult.binary: nullable FK to machine.binary ---
        migrations.AlterField(
            model_name="archiveresult",
            name="binary",
            field=models.ForeignKey(
                to="machine.binary",
                related_name="archiveresults",
                on_delete=django.db.models.deletion.SET_NULL,
                null=True,
                blank=True,
                help_text="Primary binary used by this hook",
            ),
        ),
        # --- archiveresult.output_* fields ---
        migrations.AlterField(
            model_name="archiveresult",
            name="output_files",
            field=models.JSONField(
                default=dict,
                help_text="Dict of {relative_path: {metadata}}",
            ),
        ),
        migrations.AlterField(
            model_name="archiveresult",
            name="output_json",
            field=models.JSONField(
                null=True,
                blank=True,
                default=None,
                help_text="Structured metadata (headers, redirects, etc.)",
            ),
        ),
        migrations.AlterField(
            model_name="archiveresult",
            name="output_mimetypes",
            field=models.CharField(
                max_length=512,
                blank=True,
                default="",
                help_text="CSV of mimetypes sorted by size",
            ),
        ),
        migrations.AlterField(
            model_name="archiveresult",
            name="output_size",
            field=models.BigIntegerField(
                default=0,
                help_text="Total bytes of all output files",
            ),
        ),
        migrations.AlterField(
            model_name="archiveresult",
            name="output_str",
            field=models.TextField(
                blank=True,
                default="",
                help_text="Human-readable output summary",
            ),
        ),
        # --- archiveresult.uuid ---
        # NOTE(review): uuid.uuid7 only exists in the standard library from
        # Python 3.14 onward - confirm the minimum supported interpreter.
        migrations.AlterField(
            model_name="archiveresult",
            name="uuid",
            field=models.UUIDField(
                null=True,
                blank=True,
                db_index=True,
                default=uuid.uuid7,
            ),
        ),
        # --- snapshot.timestamp uniqueness ---
        # NOTE(review): presumably fails on databases that already contain
        # duplicate timestamps - confirm existing data is deduplicated first.
        migrations.AddConstraint(
            model_name="snapshot",
            constraint=models.UniqueConstraint(
                fields=("timestamp",),
                name="unique_timestamp",
            ),
        ),
    ]

View File

@@ -0,0 +1,29 @@
# Generated by Django 6.0 on 2025-12-28
from django.db import migrations, models
class Migration(migrations.Migration):
    """Rename ``archiveresult.extractor`` to ``plugin`` and add a ``hook_name`` column."""

    # Runs after the previous migration of the ``core`` app.
    dependencies = [("core", "0032_alter_archiveresult_binary_and_more")]

    operations = [
        # Step 1: rename the existing field in place.
        migrations.RenameField(
            model_name="archiveresult",
            old_name="extractor",
            new_name="plugin",
        ),
        # Step 2: add the new indexed text field; the '' default fills existing rows.
        migrations.AddField(
            model_name="archiveresult",
            name="hook_name",
            field=models.CharField(
                max_length=255,
                db_index=True,
                blank=True,
                default="",
                help_text="Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)",
            ),
        ),
    ]

File diff suppressed because it is too large Load Diff

View File

@@ -57,7 +57,7 @@ INSTALLED_APPS = [
"django_object_actions", # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
# Our ArchiveBox-provided apps
"config", # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
"machine", # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
"machine", # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
"workers", # handles starting and managing background workers and processes (orchestrators and actors)
"crawls", # handles Crawl and CrawlSchedule models and management
"personas", # handles Persona and session management

View File

@@ -64,16 +64,10 @@ class SnapshotMachine(StateMachine, strict_states=True):
if self.snapshot.pending_archiveresults().exists():
return False
# Check for background hooks that are still running
started_results = self.snapshot.archiveresult_set.filter(
status=ArchiveResult.StatusChoices.STARTED
)
for result in started_results:
if not result.check_background_completed():
return False # Still running
# Completed - finalize it
result.finalize_background_hook()
# Don't wait for background hooks - they'll be cleaned up on entering sealed state
# Background hooks in STARTED state are excluded by pending_archiveresults()
# (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
# we can transition to sealed and cleanup() will kill the background hooks
# otherwise archiveresults exist and are all finished, so it's finished
return True
@@ -108,6 +102,9 @@ class SnapshotMachine(StateMachine, strict_states=True):
@sealed.enter
def enter_sealed(self):
# Clean up background hooks
self.snapshot.cleanup()
# Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=None,