mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 15:27:53 +10:00
add new core and crawls statemachine manager
This commit is contained in:
@@ -29,7 +29,7 @@ def get_EXTRACTORS():
|
||||
'singlefile': SINGLEFILE_EXTRACTOR,
|
||||
}
|
||||
|
||||
# @abx.hookimpl
|
||||
# def get_INSTALLED_APPS():
|
||||
# # needed to load ./models.py
|
||||
# return [__package__]
|
||||
@abx.hookimpl
def get_INSTALLED_APPS():
    """Return a one-element list containing this plugin's package name.

    Listing the package here is needed to load ./models.py.
    """
    plugin_apps = [__package__]
    return plugin_apps
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
__package__ = 'abx_plugin_singlefile'
|
||||
|
||||
from typing import ClassVar
|
||||
from django.db.models import QuerySet
|
||||
from django.utils.functional import classproperty
|
||||
|
||||
from actors.actor import ActorType
|
||||
|
||||
from .models import SinglefileResult
|
||||
|
||||
|
||||
class SinglefileActor(ActorType[SinglefileResult]):
    """Actor for SinglefileResult objects.

    The CLAIM_* attributes are SQL-like fragments consumed by ActorType
    (presumably to atomically claim the next object -- confirm against
    actors.actor.ActorType, which is not visible here).
    """

    # Ordering applied when picking the next object to claim.
    CLAIM_ORDER: ClassVar[str] = 'created_at DESC'
    # Only claim queued results that belong to this plugin's extractor.
    # This previously read extractor = "favicon" (copy/paste from another
    # actor), while this plugin registers its extractor as 'singlefile'.
    # NOTE(review): double-quoted values are identifiers in standard SQL and
    # only work as string literals through SQLite's compatibility fallback;
    # consider switching to single quotes once ActorType's usage is confirmed.
    CLAIM_WHERE: ClassVar[str] = 'status = "queued" AND extractor = "singlefile"'
    # Assignment applied to the object when it is claimed.
    CLAIM_SET: ClassVar[str] = 'status = "started"'

    @classproperty
    def QUERYSET(cls) -> QuerySet:
        """Return the queryset of SinglefileResult rows with status 'queued'."""
        return SinglefileResult.objects.filter(status='queued')

    def tick(self, obj: SinglefileResult):
        """Process one claimed object by moving it from 'started' to 'success'.

        Args:
            obj: the SinglefileResult to process; its row is expected to
                have status 'started' in the database.

        Raises:
            Exception: if the conditional update did not match exactly one
                row, i.e. the row no longer has status 'started'.
        """
        print(f'[grey53]{self}.tick({obj.abid or obj.id}, status={obj.status}) remaining:[/grey53]', self.get_queue().count())

        # Conditional UPDATE: only succeeds while the row is still 'started',
        # so a concurrent writer that changed the status is detected here.
        updated = SinglefileResult.objects.filter(id=obj.id, status='started').update(status='success') == 1
        if not updated:
            raise Exception(f'Failed to update {obj.abid or obj.id}, interrupted by another actor writing to the same object')

        # Reload the in-memory instance so it reflects the new status.
        obj.refresh_from_db()
        # NOTE(review): save() right after refresh_from_db() rewrites the row
        # unchanged; kept as-is in case save-time side effects are relied on.
        obj.save()
|
||||
@@ -20,6 +20,17 @@ from django.urls import reverse_lazy
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
# Glossary:
|
||||
# - startup: when a new process is spawned
|
||||
# - shutdown: when a process is exiting
|
||||
# - start: at the beginning of some python code block
|
||||
# - end: at the end of some python code block
|
||||
# - queue: a django queryset of objects of a single type that are waiting to be processed
|
||||
# - actor: a long-running daemon process that wakes up and processes a single object from a queue at a time
|
||||
# - plugin: a python package that defines some hookimpls based on hookspecs exposed by ABX
|
||||
# - object: an instance of a django model that represents a single row in the database
|
||||
|
||||
|
||||
# ORCHESTRATOR:
|
||||
# An orchestrator is a single long-running daemon process that manages spawning and killing actors for different queues of objects.
|
||||
# The orchestrator starts when archivebox starts, and it stops when archivebox is killed.
|
||||
@@ -74,8 +85,8 @@ from pathlib import Path
|
||||
# On startup an actor should fire abx.pm.hook.on_actor_startup(object) and on exit it should fire abx.pm.hook.on_actor_exit(object) (both synchronous hooks that can be used by plugins to register any startup/cleanup code).
|
||||
# An ActorType defines the following hookspecs for plugins to hook into its behavior:
|
||||
# - abx.pm.hook.on_actor_startup(actor, queue)
|
||||
# - abx.pm.hook.on_actor_tick_started(actor, object)
|
||||
# - abx.pm.hook.on_actor_tick_finished(actor, object)
|
||||
# - abx.pm.hook.on_actor_tick_start(actor, object)
|
||||
# - abx.pm.hook.on_actor_tick_end(actor, object)
|
||||
# - abx.pm.hook.on_actor_tick_exception(actor, object, exception)
|
||||
# - abx.pm.hook.on_actor_shutdown(actor)
|
||||
|
||||
@@ -107,8 +118,8 @@ from pathlib import Path
|
||||
# - external API calls (e.g. uploading to s3, firing a webhook, writing to a logfile, etc.)
|
||||
# - DO NOT use side effects to directly mutate other objects state or trigger other state transitions
|
||||
# ABX defines the following hookspecs for plugins to hook into transition behavior:
|
||||
# - abx.pm.hook.on_transition_<objecttype>_from_abx_to_xyz_started(object)
|
||||
# - abx.pm.hook.on_transition_<objecttype>_from_abx_to_xyz_succeeded(object)
|
||||
# - abx.pm.hook.on_transition_<objecttype>_from_abx_to_xyz_start(object)
|
||||
# - abx.pm.hook.on_transition_<objecttype>_from_abx_to_xyz_end(object)
|
||||
|
||||
# READ:
|
||||
# A read() method is a function defined for a given ActorType that performs a single read from the DB and/or other read models like django cache, filesystem, in-memory caches, etc.
|
||||
|
||||
Reference in New Issue
Block a user