wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -4,29 +4,46 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox worker'
import sys
import json
import rich_click as click
from archivebox.misc.util import docstring
def worker(worker_type: str, daemon: bool = False, plugin: str | None = None):
    """
    Start a worker process to process items from the queue.

    Worker types:
        - crawl: Process Crawl objects (parse seeds, create snapshots)
        - snapshot: Process Snapshot objects (create archive results)
        - archiveresult: Process ArchiveResult objects (run plugins)

    Workers poll the database for queued items, claim them atomically,
    and spawn subprocess tasks to handle each item.
    """
    # NOTE: this docstring is also passed to @docstring(...) on the CLI entry
    # point in this file, so keep its wording user-facing.

    # Imported lazily inside the function rather than at module import time.
    from workers.worker import get_worker_class

    # Resolve the worker class first, exactly as callers expect: an unknown
    # worker_type surfaces from get_worker_class before anything else happens.
    worker_cls = get_worker_class(worker_type)

    # The plugin filter is only forwarded for archiveresult workers; for every
    # other worker type (or an empty/None plugin) it is dropped.
    forward_plugin = bool(plugin) and worker_type == 'archiveresult'

    init_args = {'daemon': daemon}
    if forward_plugin:
        # The worker's constructor still names this option "extractor".
        init_args['extractor'] = plugin

    # Instantiate and hand control over to the worker's run loop.
    worker_cls(**init_args).runloop()
@click.command()
@click.argument('worker_type')
@click.option('--wait-for-first-event', is_flag=True)
@click.option('--exit-on-idle', is_flag=True)
def main(worker_type: str, wait_for_first_event: bool, exit_on_idle: bool):
    """Start an ArchiveBox worker process of the given type"""
    # Imported lazily inside the command rather than at module import time.
    from workers.worker import get_worker_type

    # Disabled sketch, kept for reference: feed events piped in on stdin
    # to the dispatcher before starting the worker.
    #   if not sys.stdin.isatty():
    #       for line in sys.stdin.readlines():
    #           Event.dispatch(event=json.loads(line), parent=None)

    # Look up the worker implementation registered for this type name.
    worker_cls = get_worker_type(worker_type)

    # run(...) is iterated lazily; each event is echoed as soon as it arrives.
    event_stream = worker_cls.run(
        wait_for_first_event=wait_for_first_event,
        exit_on_idle=exit_on_idle,
    )
    for emitted in event_stream:
        print(emitted)
@click.argument('worker_type', type=click.Choice(['crawl', 'snapshot', 'archiveresult']))
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
@click.option('--plugin', '-p', default=None, help='Filter by plugin (archiveresult only)')
@docstring(worker.__doc__)
def main(worker_type: str, daemon: bool, plugin: str | None):
    """Start an ArchiveBox worker process"""
    # Thin CLI shim: click parses and validates the arguments (worker_type is
    # restricted to the three choices above), then everything is delegated to
    # worker(). NOTE(review): @docstring(worker.__doc__) presumably swaps in
    # worker()'s docstring as the help text -- confirm against
    # archivebox.misc.util.docstring.
    worker(worker_type=worker_type, daemon=daemon, plugin=plugin)
if __name__ == '__main__':