mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
fix: rename --plugin to --plugins for consistency
Changed from singular --plugin to plural --plugins in both snapshot and extract commands to match the pattern in archivebox add command. Updated to accept comma-separated plugin names (e.g., --plugins=screenshot,singlefile,title).

- Updated CLI option from --plugin to --plugins
- Added parsing for comma-separated plugin names
- Updated function signatures and logic to handle multiple plugins
- Updated help text, docstrings, and examples

Co-authored-by: Nick Sweeting <pirate@users.noreply.github.com>
This commit is contained in:
@@ -1,7 +1,7 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
"""
|
"""
|
||||||
archivebox extract [snapshot_ids...] [--plugin=NAME]
|
archivebox extract [snapshot_ids...] [--plugins=NAMES]
|
||||||
|
|
||||||
Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL.
|
Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL.
|
||||||
|
|
||||||
@@ -20,8 +20,8 @@ Examples:
|
|||||||
# Pipe from snapshot command
|
# Pipe from snapshot command
|
||||||
archivebox snapshot https://example.com | archivebox extract
|
archivebox snapshot https://example.com | archivebox extract
|
||||||
|
|
||||||
# Run specific plugin only
|
# Run specific plugins only
|
||||||
archivebox extract --plugin=screenshot 01234567-89ab-cdef-0123-456789abcdef
|
archivebox extract --plugins=screenshot,singlefile 01234567-89ab-cdef-0123-456789abcdef
|
||||||
|
|
||||||
# Chain commands
|
# Chain commands
|
||||||
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
|
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
|
||||||
@@ -76,7 +76,7 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
|||||||
|
|
||||||
def run_plugins(
|
def run_plugins(
|
||||||
args: tuple,
|
args: tuple,
|
||||||
plugin: str = '',
|
plugins: str = '',
|
||||||
wait: bool = True,
|
wait: bool = True,
|
||||||
) -> int:
|
) -> int:
|
||||||
"""
|
"""
|
||||||
@@ -147,21 +147,25 @@ def run_plugins(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# Create pending ArchiveResults if needed
|
# Create pending ArchiveResults if needed
|
||||||
if plugin:
|
if plugins:
|
||||||
# Only create for specific plugin
|
# Parse comma-separated plugins list
|
||||||
result, created = ArchiveResult.objects.get_or_create(
|
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()]
|
||||||
snapshot=snapshot,
|
|
||||||
plugin=plugin,
|
# Only create for specific plugins
|
||||||
defaults={
|
for plugin_name in plugins_list:
|
||||||
'status': ArchiveResult.StatusChoices.QUEUED,
|
result, created = ArchiveResult.objects.get_or_create(
|
||||||
'retry_at': timezone.now(),
|
snapshot=snapshot,
|
||||||
}
|
plugin=plugin_name,
|
||||||
)
|
defaults={
|
||||||
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
|
'status': ArchiveResult.StatusChoices.QUEUED,
|
||||||
# Reset for retry
|
'retry_at': timezone.now(),
|
||||||
result.status = ArchiveResult.StatusChoices.QUEUED
|
}
|
||||||
result.retry_at = timezone.now()
|
)
|
||||||
result.save()
|
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
|
||||||
|
# Reset for retry
|
||||||
|
result.status = ArchiveResult.StatusChoices.QUEUED
|
||||||
|
result.retry_at = timezone.now()
|
||||||
|
result.save()
|
||||||
else:
|
else:
|
||||||
# Create all pending plugins
|
# Create all pending plugins
|
||||||
snapshot.create_pending_archiveresults()
|
snapshot.create_pending_archiveresults()
|
||||||
@@ -191,8 +195,10 @@ def run_plugins(
|
|||||||
try:
|
try:
|
||||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||||
results = snapshot.archiveresult_set.all()
|
results = snapshot.archiveresult_set.all()
|
||||||
if plugin:
|
if plugins:
|
||||||
results = results.filter(plugin=plugin)
|
# Parse comma-separated plugins list
|
||||||
|
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()]
|
||||||
|
results = results.filter(plugin__in=plugins_list)
|
||||||
|
|
||||||
for result in results:
|
for result in results:
|
||||||
if is_tty:
|
if is_tty:
|
||||||
@@ -222,10 +228,10 @@ def is_archiveresult_id(value: str) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option('--plugin', '-p', default='', help='Run only this plugin (e.g., screenshot, singlefile)')
|
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
|
||||||
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
|
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
|
||||||
@click.argument('args', nargs=-1)
|
@click.argument('args', nargs=-1)
|
||||||
def main(plugin: str, wait: bool, args: tuple):
|
def main(plugins: str, wait: bool, args: tuple):
|
||||||
"""Run plugins on Snapshots, or process existing ArchiveResults by ID"""
|
"""Run plugins on Snapshots, or process existing ArchiveResults by ID"""
|
||||||
from archivebox.misc.jsonl import read_args_or_stdin
|
from archivebox.misc.jsonl import read_args_or_stdin
|
||||||
|
|
||||||
@@ -254,7 +260,7 @@ def main(plugin: str, wait: bool, args: tuple):
|
|||||||
sys.exit(exit_code)
|
sys.exit(exit_code)
|
||||||
else:
|
else:
|
||||||
# Default behavior: run plugins on Snapshots from input
|
# Default behavior: run plugins on Snapshots from input
|
||||||
sys.exit(run_plugins(args, plugin=plugin, wait=wait))
|
sys.exit(run_plugins(args, plugins=plugins, wait=wait))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
"""
|
"""
|
||||||
archivebox snapshot [urls_or_crawl_ids...] [--tag=TAG] [--plugin=NAME]
|
archivebox snapshot [urls_or_crawl_ids...] [--tag=TAG] [--plugins=NAMES]
|
||||||
|
|
||||||
Create Snapshots from URLs or Crawl jobs. Accepts URLs, Crawl JSONL, or Crawl IDs.
|
Create Snapshots from URLs or Crawl jobs. Accepts URLs, Crawl JSONL, or Crawl IDs.
|
||||||
|
|
||||||
@@ -24,8 +24,8 @@ Examples:
|
|||||||
# Chain with extract
|
# Chain with extract
|
||||||
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
|
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
|
||||||
|
|
||||||
# Run specific plugin after creating snapshots
|
# Run specific plugins after creating snapshots
|
||||||
archivebox snapshot --plugin=screenshot https://example.com
|
archivebox snapshot --plugins=screenshot,singlefile https://example.com
|
||||||
|
|
||||||
# Process existing Snapshot by ID
|
# Process existing Snapshot by ID
|
||||||
archivebox snapshot 01234567-89ab-cdef-0123-456789abcdef
|
archivebox snapshot 01234567-89ab-cdef-0123-456789abcdef
|
||||||
@@ -74,14 +74,14 @@ def process_snapshot_by_id(snapshot_id: str) -> int:
|
|||||||
def create_snapshots(
|
def create_snapshots(
|
||||||
args: tuple,
|
args: tuple,
|
||||||
tag: str = '',
|
tag: str = '',
|
||||||
plugin: str = '',
|
plugins: str = '',
|
||||||
created_by_id: Optional[int] = None,
|
created_by_id: Optional[int] = None,
|
||||||
) -> int:
|
) -> int:
|
||||||
"""
|
"""
|
||||||
Create Snapshots from URLs, Crawl JSONL, or Crawl IDs.
|
Create Snapshots from URLs, Crawl JSONL, or Crawl IDs.
|
||||||
|
|
||||||
Reads from args or stdin, creates Snapshot objects, outputs JSONL.
|
Reads from args or stdin, creates Snapshot objects, outputs JSONL.
|
||||||
If --plugin is passed, also runs specified plugin (blocking).
|
If --plugins is passed, also runs specified plugins (blocking).
|
||||||
|
|
||||||
Exit codes:
|
Exit codes:
|
||||||
0: Success
|
0: Success
|
||||||
@@ -179,28 +179,32 @@ def create_snapshots(
|
|||||||
for snapshot in created_snapshots:
|
for snapshot in created_snapshots:
|
||||||
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
|
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
|
||||||
|
|
||||||
# If --plugin is passed, create ArchiveResults and run the orchestrator
|
# If --plugins is passed, create ArchiveResults and run the orchestrator
|
||||||
if plugin:
|
if plugins:
|
||||||
from archivebox.core.models import ArchiveResult
|
from archivebox.core.models import ArchiveResult
|
||||||
from archivebox.workers.orchestrator import Orchestrator
|
from archivebox.workers.orchestrator import Orchestrator
|
||||||
|
|
||||||
# Create ArchiveResults for the specific plugin on each snapshot
|
# Parse comma-separated plugins list
|
||||||
for snapshot in created_snapshots:
|
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()]
|
||||||
result, created = ArchiveResult.objects.get_or_create(
|
|
||||||
snapshot=snapshot,
|
|
||||||
plugin=plugin,
|
|
||||||
defaults={
|
|
||||||
'status': ArchiveResult.StatusChoices.QUEUED,
|
|
||||||
'retry_at': timezone.now(),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
|
|
||||||
# Reset for retry
|
|
||||||
result.status = ArchiveResult.StatusChoices.QUEUED
|
|
||||||
result.retry_at = timezone.now()
|
|
||||||
result.save()
|
|
||||||
|
|
||||||
rprint(f'[blue]Running plugin: {plugin}...[/blue]', file=sys.stderr)
|
# Create ArchiveResults for the specific plugins on each snapshot
|
||||||
|
for snapshot in created_snapshots:
|
||||||
|
for plugin_name in plugins_list:
|
||||||
|
result, created = ArchiveResult.objects.get_or_create(
|
||||||
|
snapshot=snapshot,
|
||||||
|
plugin=plugin_name,
|
||||||
|
defaults={
|
||||||
|
'status': ArchiveResult.StatusChoices.QUEUED,
|
||||||
|
'retry_at': timezone.now(),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
|
||||||
|
# Reset for retry
|
||||||
|
result.status = ArchiveResult.StatusChoices.QUEUED
|
||||||
|
result.retry_at = timezone.now()
|
||||||
|
result.save()
|
||||||
|
|
||||||
|
rprint(f'[blue]Running plugins: {plugins}...[/blue]', file=sys.stderr)
|
||||||
orchestrator = Orchestrator(exit_on_idle=True)
|
orchestrator = Orchestrator(exit_on_idle=True)
|
||||||
orchestrator.runloop()
|
orchestrator.runloop()
|
||||||
|
|
||||||
@@ -220,9 +224,9 @@ def is_snapshot_id(value: str) -> bool:
|
|||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add to each snapshot')
|
@click.option('--tag', '-t', default='', help='Comma-separated tags to add to each snapshot')
|
||||||
@click.option('--plugin', '-p', default='', help='Run only this plugin after creating snapshots (e.g., screenshot, singlefile)')
|
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run after creating snapshots (e.g., screenshot,singlefile)')
|
||||||
@click.argument('args', nargs=-1)
|
@click.argument('args', nargs=-1)
|
||||||
def main(tag: str, plugin: str, args: tuple):
|
def main(tag: str, plugins: str, args: tuple):
|
||||||
"""Create Snapshots from URLs/Crawls, or process existing Snapshots by ID"""
|
"""Create Snapshots from URLs/Crawls, or process existing Snapshots by ID"""
|
||||||
from archivebox.misc.jsonl import read_args_or_stdin
|
from archivebox.misc.jsonl import read_args_or_stdin
|
||||||
|
|
||||||
@@ -256,7 +260,7 @@ def main(tag: str, plugin: str, args: tuple):
|
|||||||
sys.exit(exit_code)
|
sys.exit(exit_code)
|
||||||
else:
|
else:
|
||||||
# Create new Snapshots from URLs or Crawls
|
# Create new Snapshots from URLs or Crawls
|
||||||
sys.exit(create_snapshots(args, tag=tag, plugin=plugin))
|
sys.exit(create_snapshots(args, tag=tag, plugins=plugins))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
Reference in New Issue
Block a user