Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2026-04-06 07:47:53 +10:00)
tons of fixes with codex
This commit is contained in:
@@ -4,6 +4,7 @@ __package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox add'
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
@@ -14,7 +15,7 @@ from django.db.models import QuerySet
|
||||
|
||||
from archivebox.misc.util import enforce_types, docstring
|
||||
from archivebox import CONSTANTS
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
from archivebox.config.common import ARCHIVING_CONFIG, SERVER_CONFIG
|
||||
from archivebox.config.permissions import USER, HOSTNAME
|
||||
|
||||
|
||||
@@ -57,8 +58,11 @@ def add(urls: str | list[str],
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
from archivebox.misc.logging_util import printable_filesize
|
||||
from archivebox.misc.system import get_dir_size
|
||||
|
||||
created_by_id = created_by_id or get_or_create_system_user_pk()
|
||||
started_at = timezone.now()
|
||||
|
||||
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
|
||||
@@ -127,11 +131,56 @@ def add(urls: str | list[str],
|
||||
# Background mode: just queue work and return (orchestrator via server will pick it up)
|
||||
print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
|
||||
else:
|
||||
# Foreground mode: run CrawlWorker inline until all work is done
|
||||
print(f'[green]\\[*] Starting worker to process crawl...[/green]')
|
||||
from archivebox.workers.worker import CrawlWorker
|
||||
worker = CrawlWorker(crawl_id=str(crawl.id), worker_id=0)
|
||||
worker.runloop() # Block until complete
|
||||
# Foreground mode: run full orchestrator until all work is done
|
||||
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
orchestrator = Orchestrator(exit_on_idle=True, crawl_id=str(crawl.id))
|
||||
orchestrator.runloop() # Block until complete
|
||||
|
||||
# Print summary for foreground runs
|
||||
try:
|
||||
crawl.refresh_from_db()
|
||||
snapshots_count = crawl.snapshot_set.count()
|
||||
try:
|
||||
total_bytes = sum(s.archive_size for s in crawl.snapshot_set.all())
|
||||
except Exception:
|
||||
total_bytes, _, _ = get_dir_size(crawl.output_dir)
|
||||
total_size = printable_filesize(total_bytes)
|
||||
total_time = timezone.now() - started_at
|
||||
total_seconds = int(total_time.total_seconds())
|
||||
mins, secs = divmod(total_seconds, 60)
|
||||
hours, mins = divmod(mins, 60)
|
||||
if hours:
|
||||
duration_str = f"{hours}h {mins}m {secs}s"
|
||||
elif mins:
|
||||
duration_str = f"{mins}m {secs}s"
|
||||
else:
|
||||
duration_str = f"{secs}s"
|
||||
|
||||
# Output dir relative to DATA_DIR
|
||||
try:
|
||||
rel_output = Path(crawl.output_dir).relative_to(CONSTANTS.DATA_DIR)
|
||||
rel_output_str = f'./{rel_output}'
|
||||
except Exception:
|
||||
rel_output_str = str(crawl.output_dir)
|
||||
|
||||
# Build admin URL from SERVER_CONFIG
|
||||
bind_addr = SERVER_CONFIG.BIND_ADDR
|
||||
if bind_addr.startswith('http://') or bind_addr.startswith('https://'):
|
||||
base_url = bind_addr
|
||||
else:
|
||||
base_url = f'http://{bind_addr}'
|
||||
admin_url = f'{base_url}/admin/crawls/crawl/{crawl.id}/change/'
|
||||
|
||||
print('\n[bold]crawl output saved to:[/bold]')
|
||||
print(f' {rel_output_str}')
|
||||
print(f' {admin_url}')
|
||||
print(f'\n[bold]total urls snapshotted:[/bold] {snapshots_count}')
|
||||
print(f'[bold]total size:[/bold] {total_size}')
|
||||
print(f'[bold]total time:[/bold] {duration_str}')
|
||||
except Exception:
|
||||
# Summary is best-effort; avoid failing the command if something goes wrong
|
||||
pass
|
||||
|
||||
# 6. Return the list of Snapshots in this crawl
|
||||
return crawl.snapshot_set.all()
|
||||
|
||||
@@ -205,7 +205,6 @@ def pluginmap(
|
||||
|
||||
from archivebox.hooks import (
|
||||
discover_hooks,
|
||||
extract_step,
|
||||
is_background_hook,
|
||||
BUILTIN_PLUGINS_DIR,
|
||||
USER_PLUGINS_DIR,
|
||||
@@ -277,16 +276,14 @@ def pluginmap(
|
||||
# Build hook info list
|
||||
hook_infos = []
|
||||
for hook_path in hooks:
|
||||
# Get plugin name from parent directory (e.g., 'wget' from 'plugins/wget/on_Snapshot__61_wget.py')
|
||||
# Get plugin name from parent directory (e.g., 'wget' from 'plugins/wget/on_Snapshot__06_wget.bg.py')
|
||||
plugin_name = hook_path.parent.name
|
||||
step = extract_step(hook_path.name)
|
||||
is_bg = is_background_hook(hook_path.name)
|
||||
|
||||
hook_infos.append({
|
||||
'path': str(hook_path),
|
||||
'name': hook_path.name,
|
||||
'plugin': plugin_name,
|
||||
'step': step,
|
||||
'is_background': is_bg,
|
||||
'extension': hook_path.suffix,
|
||||
})
|
||||
@@ -316,20 +313,18 @@ def pluginmap(
|
||||
show_header=True,
|
||||
header_style='bold magenta',
|
||||
)
|
||||
table.add_column('Step', justify='center', width=6)
|
||||
table.add_column('Plugin', style='cyan', width=20)
|
||||
table.add_column('Hook Name', style='green')
|
||||
table.add_column('BG', justify='center', width=4)
|
||||
table.add_column('Type', justify='center', width=5)
|
||||
|
||||
# Sort by step then by name
|
||||
sorted_hooks = sorted(hook_infos, key=lambda h: (h['step'], h['name']))
|
||||
# Sort lexicographically by hook name
|
||||
sorted_hooks = sorted(hook_infos, key=lambda h: h['name'])
|
||||
|
||||
for hook in sorted_hooks:
|
||||
bg_marker = '[yellow]bg[/yellow]' if hook['is_background'] else ''
|
||||
ext = hook['extension'].lstrip('.')
|
||||
table.add_row(
|
||||
str(hook['step']),
|
||||
hook['plugin'],
|
||||
hook['name'],
|
||||
bg_marker,
|
||||
@@ -347,7 +342,7 @@ def pluginmap(
|
||||
prnt(f'[bold]Total hooks discovered: {total_hooks}[/bold]')
|
||||
prnt()
|
||||
prnt('[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]')
|
||||
prnt('[dim] - XX: Two-digit order (first digit = step 0-9)[/dim]')
|
||||
prnt('[dim] - XX: Two-digit lexicographic order (00-99)[/dim]')
|
||||
prnt('[dim] - .bg: Background hook (non-blocking)[/dim]')
|
||||
prnt('[dim] - ext: py, sh, or js[/dim]')
|
||||
prnt()
|
||||
|
||||
Reference in New Issue
Block a user