add new core and crawls statemachine manager

This commit is contained in:
Nick Sweeting
2024-11-03 00:41:11 -07:00
parent 41efd010f0
commit 48f8416762
18 changed files with 798 additions and 374 deletions

View File

@@ -29,7 +29,7 @@ def get_EXTRACTORS():
'singlefile': SINGLEFILE_EXTRACTOR,
}
# @abx.hookimpl
# def get_INSTALLED_APPS():
# # needed to load ./models.py
# return [__package__]
@abx.hookimpl
def get_INSTALLED_APPS():
    """Return this plugin's package name so that its ./models.py gets loaded."""
    installed_apps = [__package__]
    return installed_apps

View File

@@ -0,0 +1,27 @@
__package__ = 'abx_plugin_singlefile'
from typing import ClassVar
from django.db.models import QuerySet
from django.utils.functional import classproperty
from actors.actor import ActorType
from .models import SinglefileResult
class SinglefileActor(ActorType[SinglefileResult]):
    """Actor that works through queued SinglefileResult rows one at a time.

    The CLAIM_* attributes are SQL fragments; presumably ActorType combines
    them into the statement used to claim the next row (TODO confirm against
    actors.actor.ActorType, which is not visible from this file).
    """

    CLAIM_ORDER: ClassVar[str] = 'created_at DESC'
    # Restrict claims to this extractor's own rows. This previously filtered on
    # extractor = "favicon", which was copied from another actor and does not
    # match the singlefile extractor this actor is responsible for.
    # NOTE(review): the double-quoted literals are only treated as strings by
    # lenient SQL parsers (e.g. SQLite's default build) -- confirm the backend.
    CLAIM_WHERE: ClassVar[str] = 'status = "queued" AND extractor = "singlefile"'
    CLAIM_SET: ClassVar[str] = 'status = "started"'

    @classproperty
    def QUERYSET(cls) -> QuerySet:
        """Return the SinglefileResult rows whose status is still 'queued'."""
        return SinglefileResult.objects.filter(status='queued')

    def tick(self, obj: SinglefileResult):
        """Process one claimed result by moving it from 'started' to 'success'.

        Args:
            obj: the SinglefileResult this actor is currently working on.

        Raises:
            Exception: if the conditional UPDATE did not change exactly one
                row, i.e. the row was no longer in the 'started' state.
        """
        print(f'[grey53]{self}.tick({obj.abid or obj.id}, status={obj.status}) remaining:[/grey53]', self.get_queue().count())

        # Filtering on status='started' makes the write conditional: it changes
        # zero rows if the status was modified since this object was claimed.
        rows_changed = SinglefileResult.objects.filter(id=obj.id, status='started').update(status='success')
        if rows_changed != 1:
            raise Exception(f'Failed to update {obj.abid or obj.id}, interrupted by another actor writing to the same object')

        # Pull the freshly written status into the in-memory instance, then
        # save it back through the model's regular save() path.
        obj.refresh_from_db()
        obj.save()

View File

@@ -20,6 +20,17 @@ from django.urls import reverse_lazy
from pathlib import Path
# Glossary:
# - startup: when a new process is spawned
# - shutdown: when a process is exiting
# - start: at the beginning of some python code block
# - end: at the end of some python code block
# - queue: a django queryset of objects of a single type that are waiting to be processed
# - actor: a long-running daemon process that wakes up and processes a single object from a queue at a time
# - plugin: a python package that defines some hookimpls based on hookspecs exposed by ABX
# - object: an instance of a django model that represents a single row in the database
# ORCHESTRATOR:
# An orchestrator is a single long-running daemon process that manages spawning and killing actors for different queues of objects.
# The orchestrator starts when archivebox starts, and it stops when archivebox is killed.
@@ -74,8 +85,8 @@ from pathlib import Path
# On startup an actor should fire abx.pm.hook.on_actor_startup(object) and on exit it should fire abx.pm.hook.on_actor_exit(object) (both synchronous hooks that can be used by plugins to register any startup/cleanup code).
# An ActorType defines the following hookspecs for plugins to hook into its behavior:
# - abx.pm.hook.on_actor_startup(actor, queue)
# - abx.pm.hook.on_actor_tick_started(actor, object)
# - abx.pm.hook.on_actor_tick_finished(actor, object)
# - abx.pm.hook.on_actor_tick_start(actor, object)
# - abx.pm.hook.on_actor_tick_end(actor, object)
# - abx.pm.hook.on_actor_tick_exception(actor, object, exception)
# - abx.pm.hook.on_actor_shutdown(actor)
@@ -107,8 +118,8 @@ from pathlib import Path
# - external API calls (e.g. uploading to s3, firing a webhook, writing to a logfile, etc.)
# - DO NOT use side effects to directly mutate other objects state or trigger other state transitions
# ABX defines the following hookspecs for plugins to hook into transition behavior:
# - abx.pm.hook.on_transition_<objecttype>_from_abx_to_xyz_started(object)
# - abx.pm.hook.on_transition_<objecttype>_from_abx_to_xyz_succeeded(object)
# - abx.pm.hook.on_transition_<objecttype>_from_abx_to_xyz_start(object)
# - abx.pm.hook.on_transition_<objecttype>_from_abx_to_xyz_end(object)
# READ:
# A read() method is a function defined for a given ActorType that performs a single read from the DB and/or other read models like django cache, filesystem, in-memory caches, etc.