tons of fixes with codex

This commit is contained in:
Nick Sweeting
2026-01-19 01:00:53 -08:00
parent eaf7256345
commit c7b2217cd6
184 changed files with 3943 additions and 2420 deletions

View File

@@ -4,6 +4,7 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox add'
import sys
from pathlib import Path
from typing import TYPE_CHECKING
@@ -14,7 +15,7 @@ from django.db.models import QuerySet
from archivebox.misc.util import enforce_types, docstring
from archivebox import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.config.common import ARCHIVING_CONFIG, SERVER_CONFIG
from archivebox.config.permissions import USER, HOSTNAME
@@ -57,8 +58,11 @@ def add(urls: str | list[str],
from archivebox.crawls.models import Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.workers.orchestrator import Orchestrator
from archivebox.misc.logging_util import printable_filesize
from archivebox.misc.system import get_dir_size
created_by_id = created_by_id or get_or_create_system_user_pk()
started_at = timezone.now()
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
@@ -127,11 +131,56 @@ def add(urls: str | list[str],
# Background mode: just queue work and return (orchestrator via server will pick it up)
print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
else:
# Foreground mode: run CrawlWorker inline until all work is done
print(f'[green]\\[*] Starting worker to process crawl...[/green]')
from archivebox.workers.worker import CrawlWorker
worker = CrawlWorker(crawl_id=str(crawl.id), worker_id=0)
worker.runloop() # Block until complete
# Foreground mode: run full orchestrator until all work is done
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
from archivebox.workers.orchestrator import Orchestrator
orchestrator = Orchestrator(exit_on_idle=True, crawl_id=str(crawl.id))
orchestrator.runloop() # Block until complete
# Print summary for foreground runs
try:
crawl.refresh_from_db()
snapshots_count = crawl.snapshot_set.count()
try:
total_bytes = sum(s.archive_size for s in crawl.snapshot_set.all())
except Exception:
total_bytes, _, _ = get_dir_size(crawl.output_dir)
total_size = printable_filesize(total_bytes)
total_time = timezone.now() - started_at
total_seconds = int(total_time.total_seconds())
mins, secs = divmod(total_seconds, 60)
hours, mins = divmod(mins, 60)
if hours:
duration_str = f"{hours}h {mins}m {secs}s"
elif mins:
duration_str = f"{mins}m {secs}s"
else:
duration_str = f"{secs}s"
# Output dir relative to DATA_DIR
try:
rel_output = Path(crawl.output_dir).relative_to(CONSTANTS.DATA_DIR)
rel_output_str = f'./{rel_output}'
except Exception:
rel_output_str = str(crawl.output_dir)
# Build admin URL from SERVER_CONFIG
bind_addr = SERVER_CONFIG.BIND_ADDR
if bind_addr.startswith('http://') or bind_addr.startswith('https://'):
base_url = bind_addr
else:
base_url = f'http://{bind_addr}'
admin_url = f'{base_url}/admin/crawls/crawl/{crawl.id}/change/'
print('\n[bold]crawl output saved to:[/bold]')
print(f' {rel_output_str}')
print(f' {admin_url}')
print(f'\n[bold]total urls snapshotted:[/bold] {snapshots_count}')
print(f'[bold]total size:[/bold] {total_size}')
print(f'[bold]total time:[/bold] {duration_str}')
except Exception:
# Summary is best-effort; avoid failing the command if something goes wrong
pass
# 6. Return the list of Snapshots in this crawl
return crawl.snapshot_set.all()
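The duration string above is built with a plain divmod cascade; a minimal standalone sketch of the same logic (hypothetical helper name, not part of the commit):

```python
def format_duration(total_seconds: int) -> str:
    # Same divmod cascade as the foreground summary: seconds -> h/m/s parts
    mins, secs = divmod(int(total_seconds), 60)
    hours, mins = divmod(mins, 60)
    if hours:
        return f"{hours}h {mins}m {secs}s"
    if mins:
        return f"{mins}m {secs}s"
    return f"{secs}s"

assert format_duration(3725) == "1h 2m 5s"
```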

View File

@@ -205,7 +205,6 @@ def pluginmap(
from archivebox.hooks import (
discover_hooks,
extract_step,
is_background_hook,
BUILTIN_PLUGINS_DIR,
USER_PLUGINS_DIR,
@@ -277,16 +276,14 @@ def pluginmap(
# Build hook info list
hook_infos = []
for hook_path in hooks:
# Get plugin name from parent directory (e.g., 'wget' from 'plugins/wget/on_Snapshot__61_wget.py')
# Get plugin name from parent directory (e.g., 'wget' from 'plugins/wget/on_Snapshot__06_wget.bg.py')
plugin_name = hook_path.parent.name
step = extract_step(hook_path.name)
is_bg = is_background_hook(hook_path.name)
hook_infos.append({
'path': str(hook_path),
'name': hook_path.name,
'plugin': plugin_name,
'step': step,
'is_background': is_bg,
'extension': hook_path.suffix,
})
@@ -316,20 +313,18 @@ def pluginmap(
show_header=True,
header_style='bold magenta',
)
table.add_column('Step', justify='center', width=6)
table.add_column('Plugin', style='cyan', width=20)
table.add_column('Hook Name', style='green')
table.add_column('BG', justify='center', width=4)
table.add_column('Type', justify='center', width=5)
# Sort by step then by name
sorted_hooks = sorted(hook_infos, key=lambda h: (h['step'], h['name']))
# Sort lexicographically by hook name
sorted_hooks = sorted(hook_infos, key=lambda h: h['name'])
for hook in sorted_hooks:
bg_marker = '[yellow]bg[/yellow]' if hook['is_background'] else ''
ext = hook['extension'].lstrip('.')
table.add_row(
str(hook['step']),
hook['plugin'],
hook['name'],
bg_marker,
@@ -347,7 +342,7 @@ def pluginmap(
prnt(f'[bold]Total hooks discovered: {total_hooks}[/bold]')
prnt()
prnt('[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]')
prnt('[dim] - XX: Two-digit order (first digit = step 0-9)[/dim]')
prnt('[dim] - XX: Two-digit lexicographic order (00-99)[/dim]')
prnt('[dim] - .bg: Background hook (non-blocking)[/dim]')
prnt('[dim] - ext: py, sh, or js[/dim]')
prnt()
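With the step concept removed, execution order is whatever lexicographic order the two-digit prefixes produce; a quick illustration with hypothetical hook names:

```python
hook_names = [
    'on_Snapshot__50_screenshot.js',
    'on_Snapshot__06_wget.bg.py',
    'on_Snapshot__10_chrome_tab.bg.js',
]
# sorted() on the filenames alone now fully determines run order:
print(sorted(hook_names))
# ['on_Snapshot__06_wget.bg.py', 'on_Snapshot__10_chrome_tab.bg.js', 'on_Snapshot__50_screenshot.js']
```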

View File

@@ -258,11 +258,18 @@ def get_config(
# Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session
if crawl and hasattr(crawl, "output_dir"):
config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir)
config['CRAWL_ID'] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get('CRAWL_ID')
# Apply snapshot config overrides (highest priority)
if snapshot and hasattr(snapshot, "config") and snapshot.config:
config.update(snapshot.config)
if snapshot:
config['SNAPSHOT_ID'] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get('SNAPSHOT_ID')
config['SNAPSHOT_DEPTH'] = int(getattr(snapshot, "depth", 0) or 0)
if getattr(snapshot, "crawl_id", None):
config['CRAWL_ID'] = str(snapshot.crawl_id)
# Normalize all aliases to canonical names (after all sources merged)
# This handles aliases that came from user/crawl/snapshot configs, not just env
try:

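The merge order in get_config() is lowest-to-highest priority (base config, then crawl, then snapshot, with context IDs layered on top). A minimal sketch of that precedence, assuming plain dict semantics (not the actual implementation):

```python
def merged_config(base: dict, crawl_config: dict, snapshot_config: dict) -> dict:
    # Later .update() calls win, so snapshot config has the highest priority
    config: dict = {}
    config.update(base)
    config.update(crawl_config)
    config.update(snapshot_config)
    return config

assert merged_config({'TIMEOUT': 60}, {'TIMEOUT': 120}, {})['TIMEOUT'] == 120
```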
View File

@@ -344,6 +344,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
@property
def process_set(self):
"""Get all Process objects related to this snapshot's ArchiveResults."""
import json
from archivebox.machine.models import Process
return Process.objects.filter(archiveresult__snapshot_id=self.id)
@@ -613,7 +615,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
ONLY used by: archivebox update (for orphan detection)
"""
import json
from archivebox.machine.models import Process
# Try index.jsonl first (new format), then index.json (legacy)
jsonl_path = snapshot_dir / CONSTANTS.JSONL_INDEX_FILENAME
@@ -622,15 +624,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
data = None
if jsonl_path.exists():
try:
with open(jsonl_path) as f:
for line in f:
line = line.strip()
if line.startswith('{'):
record = json.loads(line)
if record.get('type') == 'Snapshot':
data = record
break
except (json.JSONDecodeError, OSError):
records = Process.parse_records_from_text(jsonl_path.read_text())
for record in records:
if record.get('type') == 'Snapshot':
data = record
break
except OSError:
pass
elif json_path.exists():
try:
@@ -689,7 +688,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
ONLY used by: archivebox update (for orphan import)
"""
import json
from archivebox.machine.models import Process
# Try index.jsonl first (new format), then index.json (legacy)
jsonl_path = snapshot_dir / CONSTANTS.JSONL_INDEX_FILENAME
@@ -698,15 +697,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
data = None
if jsonl_path.exists():
try:
with open(jsonl_path) as f:
for line in f:
line = line.strip()
if line.startswith('{'):
record = json.loads(line)
if record.get('type') == 'Snapshot':
data = record
break
except (json.JSONDecodeError, OSError):
records = Process.parse_records_from_text(jsonl_path.read_text())
for record in records:
if record.get('type') == 'Snapshot':
data = record
break
except OSError:
pass
elif json_path.exists():
try:
@@ -1040,7 +1036,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
Returns dict with keys: 'snapshot', 'archive_results', 'binaries', 'processes'
"""
import json
from archivebox.machine.models import Process
from archivebox.misc.jsonl import (
TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_BINARY, TYPE_PROCESS,
)
@@ -1056,24 +1052,17 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if not index_path.exists():
return result
with open(index_path, 'r') as f:
for line in f:
line = line.strip()
if not line or not line.startswith('{'):
continue
try:
record = json.loads(line)
record_type = record.get('type')
if record_type == TYPE_SNAPSHOT:
result['snapshot'] = record
elif record_type == TYPE_ARCHIVERESULT:
result['archive_results'].append(record)
elif record_type == TYPE_BINARY:
result['binaries'].append(record)
elif record_type == TYPE_PROCESS:
result['processes'].append(record)
except json.JSONDecodeError:
continue
records = Process.parse_records_from_text(index_path.read_text())
for record in records:
record_type = record.get('type')
if record_type == TYPE_SNAPSHOT:
result['snapshot'] = record
elif record_type == TYPE_ARCHIVERESULT:
result['archive_results'].append(record)
elif record_type == TYPE_BINARY:
result['binaries'].append(record)
elif record_type == TYPE_PROCESS:
result['processes'].append(record)
return result
@@ -1317,7 +1306,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
for plugin in all_plugins:
result = archive_results.get(plugin)
existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
icon = get_plugin_icon(plugin)
icon = mark_safe(get_plugin_icon(plugin))
# Skip plugins with empty icons that have no output
# (e.g., staticfile only shows when there's actual output)
@@ -1373,6 +1362,45 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return str(current_path)
def ensure_crawl_symlink(self) -> None:
"""Ensure snapshot is symlinked under its crawl output directory."""
import os
from pathlib import Path
from django.utils import timezone
from archivebox import DATA_DIR
from archivebox.crawls.models import Crawl
if not self.crawl_id:
return
crawl = Crawl.objects.filter(id=self.crawl_id).select_related('created_by').first()
if not crawl:
return
date_base = crawl.created_at or self.created_at or timezone.now()
date_str = date_base.strftime('%Y%m%d')
domain = self.extract_domain_from_url(self.url)
username = crawl.created_by.username if crawl.created_by_id else 'system'
crawl_dir = DATA_DIR / 'users' / username / 'crawls' / date_str / domain / str(crawl.id)
link_path = crawl_dir / 'snapshots' / domain / str(self.id)
link_parent = link_path.parent
link_parent.mkdir(parents=True, exist_ok=True)
target = Path(self.output_dir)
if link_path.exists() or link_path.is_symlink():
if link_path.is_symlink():
if link_path.resolve() == target.resolve():
return
link_path.unlink(missing_ok=True)
else:
return
rel_target = os.path.relpath(target, link_parent)
try:
link_path.symlink_to(rel_target, target_is_directory=True)
except OSError:
return
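ensure_crawl_symlink() deliberately writes a relative target via os.path.relpath; a hedged illustration of why (paths are hypothetical):

```python
import os

target = '/data/archive/1700000000.0'
link_parent = '/data/users/alice/crawls/20260119/example.com/abc123/snapshots/example.com'
rel_target = os.path.relpath(target, link_parent)
# rel_target climbs out of link_parent ('../../...') and back down into archive/,
# so the symlink keeps resolving if the whole data dir is moved or remounted.
```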
@cached_property
def archive_path(self):
return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}'
@@ -1636,6 +1664,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if update_fields:
snapshot.save(update_fields=update_fields + ['modified_at'])
snapshot.ensure_crawl_symlink()
return snapshot
def create_pending_archiveresults(self) -> list['ArchiveResult']:
@@ -1689,7 +1719,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
"""
# Check if any ARs are still pending/started
pending = self.archiveresult_set.exclude(
status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES
status__in=ArchiveResult.FINAL_STATES
).exists()
return not pending
@@ -1754,7 +1784,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
- Plugins run in order (numeric prefix)
- Each plugin checks its dependencies at runtime
Dependency handling (e.g., chrome_session → screenshot):
Dependency handling (e.g., chrome → screenshot):
- Plugins check if required outputs exist before running
- If dependency output missing → plugin returns 'skipped'
- On retry, if dependency now succeeds → dependent can run
@@ -2117,6 +2147,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
TITLE_LOADING_MSG = 'Not yet archived...'
canonical = self.canonical_outputs()
preview_priority = [
'singlefile_path',
'screenshot_path',
'wget_path',
'dom_path',
'pdf_path',
'readability_path',
]
best_preview_path = next(
(canonical.get(key) for key in preview_priority if canonical.get(key)),
canonical.get('index_path', 'index.html'),
)
context = {
**self.to_dict(extended=True),
**{f'{k}_path': v for k, v in canonical.items()},
@@ -2132,6 +2174,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'oldest_archive_date': ts_to_date_str(self.oldest_archive_date),
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'best_preview_path': best_preview_path,
}
rendered_html = render_to_string('snapshot.html', context)
atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
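best_preview_path is a first-truthy-wins chain over the priority list; the same idiom in isolation with toy values:

```python
canonical = {'screenshot_path': 'screenshot.png', 'dom_path': 'output.html'}
preview_priority = ['singlefile_path', 'screenshot_path', 'wget_path', 'dom_path']
best = next(
    (canonical.get(key) for key in preview_priority if canonical.get(key)),
    canonical.get('index_path', 'index.html'),  # fallback when nothing matches
)
assert best == 'screenshot.png'  # first key in priority order with a truthy value
```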
@@ -2669,12 +2712,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
- end_ts, retry_at, cmd, cmd_version, binary FK
- Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records()
"""
import json
import mimetypes
from collections import defaultdict
from pathlib import Path
from django.utils import timezone
from archivebox.hooks import process_hook_records
from archivebox.hooks import process_hook_records, extract_records_from_process
from archivebox.machine.models import Process
plugin_dir = Path(self.pwd) if self.pwd else None
if not plugin_dir or not plugin_dir.exists():
@@ -2687,15 +2730,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Read and parse JSONL output from stdout.log
stdout_file = plugin_dir / 'stdout.log'
stdout = stdout_file.read_text() if stdout_file.exists() else ''
records = []
for line in stdout.splitlines():
if line.strip() and line.strip().startswith('{'):
try:
records.append(json.loads(line))
except json.JSONDecodeError:
continue
if self.process_id and self.process:
records = extract_records_from_process(self.process)
if not records:
stdout = stdout_file.read_text() if stdout_file.exists() else ''
records = Process.parse_records_from_text(stdout)
# Find ArchiveResult record and update status/output from it
ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
@@ -2722,9 +2763,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self._set_binary_from_cmd(hook_data['cmd'])
# Note: cmd_version is derived from binary.version, not stored on Process
else:
# No ArchiveResult record = failed
self.status = self.StatusChoices.FAILED
self.output_str = 'Hook did not output ArchiveResult record'
# No ArchiveResult record: treat background hooks or clean exits as skipped
is_background = False
try:
from archivebox.hooks import is_background_hook
is_background = bool(self.hook_name and is_background_hook(self.hook_name))
except Exception:
pass
if is_background or (self.process_id and self.process and self.process.exit_code == 0):
self.status = self.StatusChoices.SKIPPED
self.output_str = 'Hook did not output ArchiveResult record'
else:
self.status = self.StatusChoices.FAILED
self.output_str = 'Hook did not output ArchiveResult record'
# Walk filesystem and populate output_files, output_size, output_mimetypes
exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
@@ -2793,14 +2845,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
}
process_hook_records(filtered_records, overrides=overrides)
# Cleanup PID files and empty logs
# Cleanup PID files (keep logs even if empty so they can be tailed)
pid_file = plugin_dir / 'hook.pid'
pid_file.unlink(missing_ok=True)
stderr_file = plugin_dir / 'stderr.log'
if stdout_file.exists() and stdout_file.stat().st_size == 0:
stdout_file.unlink()
if stderr_file.exists() and stderr_file.stat().st_size == 0:
stderr_file.unlink()
def _set_binary_from_cmd(self, cmd: list) -> None:
"""
@@ -3186,4 +3233,4 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
# Manually register state machines with python-statemachine registry
# (normally auto-discovered from statemachines.py, but we define them here for clarity)
registry.register(SnapshotMachine)
registry.register(ArchiveResultMachine)
registry.register(ArchiveResultMachine)

View File

@@ -436,6 +436,10 @@ SIGNAL_WEBHOOKS = {
},
}
# Avoid background threads touching sqlite connections (especially during tests/migrations).
if DATABASES["default"]["ENGINE"].endswith("sqlite3"):
SIGNAL_WEBHOOKS["TASK_HANDLER"] = "signal_webhooks.handlers.sync_task_handler"
################################################################################
### Admin Data View Settings
################################################################################

View File

@@ -120,7 +120,15 @@ class SnapshotView(View):
# Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
# Convert to base names for display ordering
all_plugins = [get_plugin_name(e) for e in get_enabled_plugins()]
preferred_types = tuple(all_plugins)
preview_priority = [
'singlefile',
'screenshot',
'wget',
'dom',
'pdf',
'readability',
]
preferred_types = tuple(preview_priority + [p for p in all_plugins if p not in preview_priority])
all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
best_result = {'path': 'None', 'result': None}

View File

@@ -313,6 +313,12 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if tags:
snapshot.save_tags(tags.split(','))
# Ensure crawl -> snapshot symlink exists for both new and existing snapshots
try:
snapshot.ensure_crawl_symlink()
except Exception:
pass
return created_snapshots
def run(self) -> 'Snapshot | None':
@@ -325,7 +331,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
The root Snapshot for this crawl, or None for system crawls that don't create snapshots
"""
import time
import json
from pathlib import Path
from archivebox.hooks import run_hook, discover_hooks, process_hook_records
from archivebox.config.configset import get_config
@@ -339,35 +344,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
# Get merged config with crawl context
config = get_config(crawl=self)
# Load all binaries.jsonl files from plugins
# This replaces individual on_Crawl install hooks with declarative configuration
from archivebox.hooks import BUILTIN_PLUGINS_DIR
from archivebox.machine.models import Machine
machine_id = str(Machine.current().id)
binaries_records = []
for binaries_file in BUILTIN_PLUGINS_DIR.glob('*/binaries.jsonl'):
try:
with open(binaries_file, 'r') as f:
for line in f:
line = line.strip()
if line and not line.startswith('#'):
try:
record = json.loads(line)
if record.get('type') == 'Binary':
record['machine_id'] = machine_id
binaries_records.append(record)
except json.JSONDecodeError:
pass
except Exception:
pass
# Process binary declarations before running hooks
if binaries_records:
overrides = {'crawl': self}
process_hook_records(binaries_records, overrides=overrides)
# Discover and run on_Crawl hooks
with open(debug_log, 'a') as f:
f.write(f'Discovering Crawl hooks...\n')
@@ -418,6 +394,34 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if stats:
print(f'[green]✓ Created: {stats}[/green]')
# Ensure any newly declared binaries are installed before creating snapshots
from archivebox.machine.models import Binary, Machine
from django.utils import timezone
machine = Machine.current()
while True:
pending_binaries = Binary.objects.filter(
machine=machine,
status=Binary.StatusChoices.QUEUED,
retry_at__lte=timezone.now(),
).order_by('retry_at')
if not pending_binaries.exists():
break
for binary in pending_binaries:
try:
binary.sm.tick()
except Exception:
continue
# Exit if nothing else is immediately retryable
if not Binary.objects.filter(
machine=machine,
status=Binary.StatusChoices.QUEUED,
retry_at__lte=timezone.now(),
).exists():
break
# Create snapshots from all URLs in self.urls
with open(debug_log, 'a') as f:
f.write(f'Creating snapshots from URLs...\n')

View File

@@ -15,29 +15,29 @@ Hook contract:
Exit: 0 = success, non-zero = failure
Execution order:
- Hooks are numbered 00-99 with first digit determining step (0-9)
- All hooks in a step can run in parallel
- Steps execute sequentially (step 0 → step 1 → ... → step 9)
- Background hooks (.bg suffix) don't block step advancement
- Hooks are named with two-digit prefixes (00-99) and sorted lexicographically by filename
- Foreground hooks run sequentially in that order
- Background hooks (.bg suffix) run concurrently and do not block foreground progress
- After all foreground hooks complete, background hooks receive SIGTERM and must finalize
- Failed extractors don't block subsequent extractors
Hook Naming Convention:
on_{ModelName}__{run_order}_{description}[.bg].{ext}
Examples:
on_Snapshot__00_setup.py # Step 0, runs first
on_Snapshot__20_chrome_tab.bg.js # Step 2, background (doesn't block)
on_Snapshot__50_screenshot.js # Step 5, foreground (blocks step)
on_Snapshot__63_media.bg.py # Step 6, background (long-running)
on_Snapshot__00_setup.py # runs first
on_Snapshot__10_chrome_tab.bg.js # background (doesn't block)
on_Snapshot__50_screenshot.js # foreground (blocks)
on_Snapshot__63_media.bg.py # background (long-running)
Dependency handling:
Extractor plugins that depend on other plugins' output should check at runtime:
```python
# Example: screenshot plugin depends on chrome plugin
chrome_session_dir = Path(os.environ.get('SNAPSHOT_DIR', '.')) / 'chrome_session'
if not (chrome_session_dir / 'session.json').exists():
print('{"status": "skipped", "output": "chrome_session not available"}')
chrome_dir = Path(os.environ.get('SNAPSHOT_DIR', '.')) / 'chrome'
if not (chrome_dir / 'cdp_url.txt').exists():
print('{"status": "skipped", "output": "chrome session not available"}')
sys.exit(1) # Exit non-zero so it gets retried later
```
@@ -50,7 +50,7 @@ API (all hook logic lives here):
discover_hooks(event) -> List[Path] Find hook scripts
run_hook(script, ...) -> HookResult Execute a hook script
run_hooks(event, ...) -> List[HookResult] Run all hooks for an event
extract_step(hook_name) -> int Get step number (0-9) from hook name
extract_step(hook_name) -> int Deprecated: get two-digit order prefix if present
is_background_hook(name) -> bool Check if hook is background (.bg suffix)
"""
@@ -67,6 +67,7 @@ from typing import List, Dict, Any, Optional, TypedDict
from django.conf import settings
from django.utils import timezone
from django.utils.safestring import mark_safe
# Plugin directories
@@ -80,51 +81,33 @@ USER_PLUGINS_DIR = Path(getattr(settings, 'DATA_DIR', Path.cwd())) / 'plugins'
def extract_step(hook_name: str) -> int:
"""
Extract step number (0-9) from hook name.
Deprecated: return the two-digit order prefix as an integer (00-99) if present.
Hooks are numbered 00-99 with the first digit determining the step.
Pattern: on_{Model}__{XX}_{description}[.bg].{ext}
Args:
hook_name: Hook filename (e.g., 'on_Snapshot__50_wget.py')
Returns:
Step number 0-9, or 9 (default) for unnumbered hooks.
Examples:
extract_step('on_Snapshot__05_chrome.py') -> 0
extract_step('on_Snapshot__50_wget.py') -> 5
extract_step('on_Snapshot__63_media.bg.py') -> 6
extract_step('on_Snapshot__99_cleanup.sh') -> 9
extract_step('on_Snapshot__unnumbered.py') -> 9 (default)
Hook execution is based on lexicographic ordering of filenames; callers should
not rely on parsed numeric steps for ordering decisions.
"""
# Pattern matches __XX_ where XX is two digits
match = re.search(r'__(\d{2})_', hook_name)
if match:
two_digit = int(match.group(1))
step = two_digit // 10 # First digit is the step (0-9)
return step
# Log warning for unnumbered hooks and default to step 9
return int(match.group(1))
import sys
print(f"Warning: Hook '{hook_name}' has no step number (expected __XX_), defaulting to step 9", file=sys.stderr)
return 9
print(f"Warning: Hook '{hook_name}' has no order prefix (expected __XX_), defaulting to 99", file=sys.stderr)
return 99
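Under the deprecated-but-kept behavior, the function returns the full two-digit prefix rather than its first digit:

```python
extract_step('on_Snapshot__06_wget.py')      # -> 6
extract_step('on_Snapshot__63_media.bg.py')  # -> 63
extract_step('on_Snapshot__unnumbered.py')   # -> 99 (warning printed to stderr)
```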
def is_background_hook(hook_name: str) -> bool:
"""
Check if a hook is a background hook (doesn't block step advancement).
Check if a hook is a background hook (doesn't block foreground progression).
Background hooks have '.bg.' in their filename before the extension.
Args:
hook_name: Hook filename (e.g., 'on_Snapshot__20_chrome_tab.bg.js')
hook_name: Hook filename (e.g., 'on_Snapshot__10_chrome_tab.bg.js')
Returns:
True if background hook, False if foreground.
Examples:
is_background_hook('on_Snapshot__20_chrome_tab.bg.js') -> True
is_background_hook('on_Snapshot__10_chrome_tab.bg.js') -> True
is_background_hook('on_Snapshot__50_wget.py') -> False
is_background_hook('on_Snapshot__63_media.bg.py') -> True
"""
@@ -273,6 +256,7 @@ def run_hook(
"""
from archivebox.machine.models import Process, Machine
import time
import sys
start_time = time.time()
# Auto-detect timeout from plugin config if not explicitly provided
@@ -313,7 +297,7 @@ def run_hook(
if ext == '.sh':
cmd = ['bash', str(script)]
elif ext == '.py':
cmd = ['python3', str(script)]
cmd = [sys.executable, str(script)]
elif ext == '.js':
cmd = ['node', str(script)]
else:
@@ -393,10 +377,10 @@ def run_hook(
# Priority: config dict > Machine.config > derive from LIB_DIR
node_path = config.get('NODE_PATH')
if not node_path and lib_dir:
# Derive from LIB_DIR/npm/node_modules
# Derive from LIB_DIR/npm/node_modules (create if needed)
node_modules_dir = Path(lib_dir) / 'npm' / 'node_modules'
if node_modules_dir.exists():
node_path = str(node_modules_dir)
node_modules_dir.mkdir(parents=True, exist_ok=True)
node_path = str(node_modules_dir)
if not node_path:
try:
# Fallback to Machine.config
@@ -462,7 +446,7 @@ def run_hook(
cmd=cmd,
timeout=timeout,
status=Process.StatusChoices.EXITED,
exit_code=-1,
exit_code=1,
stderr=f'Failed to run hook: {type(e).__name__}: {e}',
)
return process
@@ -472,7 +456,6 @@ def extract_records_from_process(process: 'Process') -> List[Dict[str, Any]]:
"""
Extract JSONL records from a Process's stdout.
Uses the same parse_line() logic from misc/jsonl.py.
Adds plugin metadata to each record.
Args:
@@ -481,32 +464,20 @@ def extract_records_from_process(process: 'Process') -> List[Dict[str, Any]]:
Returns:
List of parsed JSONL records with plugin metadata
"""
from archivebox.misc.jsonl import parse_line
records = []
# Read stdout from process
stdout = process.stdout
if not stdout and process.stdout_file and process.stdout_file.exists():
stdout = process.stdout_file.read_text()
if not stdout:
return records
records = process.get_records()
if not records:
return []
# Extract plugin metadata from process.pwd and process.cmd
plugin_name = Path(process.pwd).name if process.pwd else 'unknown'
hook_name = Path(process.cmd[1]).name if len(process.cmd) > 1 else 'unknown'
plugin_hook = process.cmd[1] if len(process.cmd) > 1 else ''
# Parse each line as JSONL
for line in stdout.splitlines():
record = parse_line(line)
if record and 'type' in record:
# Add plugin metadata to record
record.setdefault('plugin', plugin_name)
record.setdefault('hook_name', hook_name)
record.setdefault('plugin_hook', plugin_hook)
records.append(record)
for record in records:
# Add plugin metadata to record
record.setdefault('plugin', plugin_name)
record.setdefault('hook_name', hook_name)
record.setdefault('plugin_hook', plugin_hook)
return records
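Because the metadata is attached with setdefault, a record that already names its plugin is left untouched; for instance:

```python
record = {'type': 'Snapshot', 'url': 'https://example.com', 'plugin': 'rss'}
record.setdefault('plugin', 'wget')  # no-op: the existing value wins
assert record['plugin'] == 'rss'
```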
@@ -538,18 +509,13 @@ def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
continue
try:
with open(urls_file, 'r') as f:
for line in f:
line = line.strip()
if line:
try:
entry = json.loads(line)
if entry.get('url'):
# Track which parser plugin found this URL
entry['plugin'] = subdir.name
urls.append(entry)
except json.JSONDecodeError:
continue
from archivebox.machine.models import Process
text = urls_file.read_text()
for entry in Process.parse_records_from_text(text):
if entry.get('url'):
# Track which parser plugin found this URL
entry['plugin'] = subdir.name
urls.append(entry)
except Exception:
pass
@@ -610,8 +576,8 @@ def get_plugins() -> List[str]:
The plugin name is the plugin directory name, not the hook script name.
Example:
archivebox/plugins/chrome_session/on_Snapshot__20_chrome_tab.bg.js
-> plugin = 'chrome_session'
archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js
-> plugin = 'chrome'
Sorted alphabetically (plugins control their hook order via numeric prefixes in hook names).
"""
@@ -817,7 +783,7 @@ def discover_plugin_configs() -> Dict[str, Dict[str, Any]]:
Returns:
Dict mapping plugin names to their parsed JSONSchema configs.
e.g., {'wget': {...schema...}, 'chrome_session': {...schema...}}
e.g., {'wget': {...schema...}, 'chrome': {...schema...}}
Example config.json:
{
@@ -928,14 +894,10 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
if plugins_whitelist:
# PLUGINS whitelist is specified - only enable plugins in the list
plugin_names = [p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()]
import sys
print(f"DEBUG: PLUGINS whitelist='{plugins_whitelist}', checking plugin '{plugin_name}', plugin_names={plugin_names}", file=sys.stderr)
if plugin_name.lower() not in plugin_names:
# Plugin not in whitelist - explicitly disabled
print(f"DEBUG: Plugin '{plugin_name}' NOT in whitelist, disabling", file=sys.stderr)
enabled = False
else:
print(f"DEBUG: Plugin '{plugin_name}' IS in whitelist, enabling", file=sys.stderr)
# Plugin is in whitelist - check if explicitly disabled by PLUGINNAME_ENABLED
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
@@ -945,10 +907,8 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
enabled = enabled.lower() not in ('false', '0', 'no', '')
else:
# No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
import sys
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
print(f"DEBUG: NO PLUGINS whitelist in config, checking {enabled_key}={enabled}", file=sys.stderr)
if enabled is None:
enabled = True
elif isinstance(enabled, str):
@@ -1064,10 +1024,10 @@ def get_plugin_icon(plugin: str) -> str:
# Try plugin-provided icon template
icon_template = get_plugin_template(plugin, 'icon', fallback=False)
if icon_template:
return icon_template.strip()
return mark_safe(icon_template.strip())
# Fall back to generic folder icon
return '📁'
return mark_safe('📁')
def get_all_plugin_icons() -> Dict[str, str]:
@@ -1204,18 +1164,14 @@ def create_model_record(record: Dict[str, Any]) -> Any:
return obj
elif record_type == 'Machine':
# Machine config update (special _method handling)
method = record.pop('_method', None)
if method == 'update':
key = record.get('key')
value = record.get('value')
if key and value:
machine = Machine.current()
if not machine.config:
machine.config = {}
machine.config[key] = value
machine.save(update_fields=['config'])
return machine
config_patch = record.get('config')
if isinstance(config_patch, dict) and config_patch:
machine = Machine.current()
if not machine.config:
machine.config = {}
machine.config.update(config_patch)
machine.save(update_fields=['config'])
return machine
return None
# Add more types as needed (Dependency, Snapshot, etc.)
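The Machine branch now expects a config dict patch instead of the old _method/key/value triple; a record in the new shape looks like this (hypothetical key):

```python
record = {
    'type': 'Machine',
    'config': {'CHROME_BINARY': '/usr/bin/chromium'},
}
# create_model_record() merges record['config'] into Machine.current().config
```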

View File

@@ -227,33 +227,45 @@ def get_os_info() -> Dict[str, Any]:
}
def get_host_stats() -> Dict[str, Any]:
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_usage = psutil.disk_usage(str(tmp_dir))
app_usage = psutil.disk_usage(str(PACKAGE_DIR))
data_usage = psutil.disk_usage(str(DATA_DIR))
mem_usage = psutil.virtual_memory()
swap_usage = psutil.swap_memory()
return {
"cpu_boot_time": datetime.fromtimestamp(psutil.boot_time()).isoformat(),
"cpu_count": psutil.cpu_count(logical=False),
"cpu_load": psutil.getloadavg(),
# "cpu_pct": psutil.cpu_percent(interval=1),
"mem_virt_used_pct": mem_usage.percent,
"mem_virt_used_gb": round(mem_usage.used / 1024 / 1024 / 1024, 3),
"mem_virt_free_gb": round(mem_usage.free / 1024 / 1024 / 1024, 3),
"mem_swap_used_pct": swap_usage.percent,
"mem_swap_used_gb": round(swap_usage.used / 1024 / 1024 / 1024, 3),
"mem_swap_free_gb": round(swap_usage.free / 1024 / 1024 / 1024, 3),
"disk_tmp_used_pct": tmp_usage.percent,
"disk_tmp_used_gb": round(tmp_usage.used / 1024 / 1024 / 1024, 3),
"disk_tmp_free_gb": round(tmp_usage.free / 1024 / 1024 / 1024, 3), # in GB
"disk_app_used_pct": app_usage.percent,
"disk_app_used_gb": round(app_usage.used / 1024 / 1024 / 1024, 3),
"disk_app_free_gb": round(app_usage.free / 1024 / 1024 / 1024, 3),
"disk_data_used_pct": data_usage.percent,
"disk_data_used_gb": round(data_usage.used / 1024 / 1024 / 1024, 3),
"disk_data_free_gb": round(data_usage.free / 1024 / 1024 / 1024, 3),
}
try:
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_usage = psutil.disk_usage(str(tmp_dir))
app_usage = psutil.disk_usage(str(PACKAGE_DIR))
data_usage = psutil.disk_usage(str(DATA_DIR))
mem_usage = psutil.virtual_memory()
try:
swap_usage = psutil.swap_memory()
swap_used_pct = swap_usage.percent
swap_used_gb = round(swap_usage.used / 1024 / 1024 / 1024, 3)
swap_free_gb = round(swap_usage.free / 1024 / 1024 / 1024, 3)
except OSError:
# Some sandboxed environments deny access to swap stats
swap_used_pct = 0.0
swap_used_gb = 0.0
swap_free_gb = 0.0
return {
"cpu_boot_time": datetime.fromtimestamp(psutil.boot_time()).isoformat(),
"cpu_count": psutil.cpu_count(logical=False),
"cpu_load": psutil.getloadavg(),
# "cpu_pct": psutil.cpu_percent(interval=1),
"mem_virt_used_pct": mem_usage.percent,
"mem_virt_used_gb": round(mem_usage.used / 1024 / 1024 / 1024, 3),
"mem_virt_free_gb": round(mem_usage.free / 1024 / 1024 / 1024, 3),
"mem_swap_used_pct": swap_used_pct,
"mem_swap_used_gb": swap_used_gb,
"mem_swap_free_gb": swap_free_gb,
"disk_tmp_used_pct": tmp_usage.percent,
"disk_tmp_used_gb": round(tmp_usage.used / 1024 / 1024 / 1024, 3),
"disk_tmp_free_gb": round(tmp_usage.free / 1024 / 1024 / 1024, 3), # in GB
"disk_app_used_pct": app_usage.percent,
"disk_app_used_gb": round(app_usage.used / 1024 / 1024 / 1024, 3),
"disk_app_free_gb": round(app_usage.free / 1024 / 1024 / 1024, 3),
"disk_data_used_pct": data_usage.percent,
"disk_data_used_gb": round(data_usage.used / 1024 / 1024 / 1024, 3),
"disk_data_free_gb": round(data_usage.free / 1024 / 1024 / 1024, 3),
}
except Exception:
return {}
def get_host_immutable_info(host_info: Dict[str, Any]) -> Dict[str, Any]:
return {

View File

@@ -113,23 +113,20 @@ class Machine(ModelWithHealthStats):
Update Machine config from JSON dict.
Args:
record: JSON dict with '_method': 'update', 'key': '...', 'value': '...'
record: JSON dict with 'config': {key: value} patch
overrides: Not used
Returns:
Machine instance or None
"""
method = record.get('_method')
if method == 'update':
key = record.get('key')
value = record.get('value')
if key and value:
machine = Machine.current()
if not machine.config:
machine.config = {}
machine.config[key] = value
machine.save(update_fields=['config'])
return machine
config_patch = record.get('config')
if isinstance(config_patch, dict) and config_patch:
machine = Machine.current()
if not machine.config:
machine.config = {}
machine.config.update(config_patch)
machine.save(update_fields=['config'])
return machine
return None
@@ -458,31 +455,31 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
continue
# Parse JSONL output to check for successful installation
stdout_file = plugin_output_dir / 'stdout.log'
if stdout_file.exists():
stdout = stdout_file.read_text()
for line in stdout.splitlines():
if line.strip() and line.strip().startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'Binary' and record.get('abspath'):
# Update self from successful installation
self.abspath = record['abspath']
self.version = record.get('version', '')
self.sha256 = record.get('sha256', '')
self.binprovider = record.get('binprovider', 'env')
self.status = self.StatusChoices.INSTALLED
self.save()
from archivebox.hooks import extract_records_from_process, process_hook_records
records = extract_records_from_process(process)
if records:
process_hook_records(records, overrides={})
binary_records = [
record for record in records
if record.get('type') == 'Binary' and record.get('abspath')
]
if binary_records:
record = binary_records[0]
# Update self from successful installation
self.abspath = record['abspath']
self.version = record.get('version', '')
self.sha256 = record.get('sha256', '')
self.binprovider = record.get('binprovider', 'env')
self.status = self.StatusChoices.INSTALLED
self.save()
# Symlink binary into LIB_BIN_DIR if configured
from django.conf import settings
lib_bin_dir = getattr(settings, 'LIB_BIN_DIR', None)
if lib_bin_dir:
self.symlink_to_lib_bin(lib_bin_dir)
# Symlink binary into LIB_BIN_DIR if configured
from django.conf import settings
lib_bin_dir = getattr(settings, 'LIB_BIN_DIR', None)
if lib_bin_dir:
self.symlink_to_lib_bin(lib_bin_dir)
return
except json.JSONDecodeError:
continue
return
# No hook succeeded - leave status as QUEUED (will retry later)
# Don't set to FAILED since we don't have that status anymore
@@ -861,6 +858,27 @@ class Process(models.Model):
record['timeout'] = self.timeout
return record
@classmethod
def parse_records_from_text(cls, text: str) -> list[dict]:
"""Parse JSONL records from raw text using the shared JSONL parser."""
from archivebox.misc.jsonl import parse_line
records: list[dict] = []
if not text:
return records
for line in text.splitlines():
record = parse_line(line)
if record and record.get('type'):
records.append(record)
return records
def get_records(self) -> list[dict]:
"""Parse JSONL records from this process's stdout."""
stdout = self.stdout
if not stdout and self.stdout_file and self.stdout_file.exists():
stdout = self.stdout_file.read_text()
return self.parse_records_from_text(stdout or '')
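A quick usage sketch: plain log lines are skipped and only JSON objects carrying a 'type' key are kept (assuming parse_line behaves as in misc/jsonl.py):

```python
text = '\n'.join([
    'fetching https://example.com ...',
    '{"type": "ArchiveResult", "status": "succeeded"}',
    '{"no_type_key": true}',
])
records = Process.parse_records_from_text(text)
# -> [{'type': 'ArchiveResult', 'status': 'succeeded'}]
```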
@staticmethod
def from_json(record: dict, overrides: dict = None):
"""
@@ -919,6 +937,7 @@ class Process(models.Model):
if (_CURRENT_PROCESS.pid == current_pid and
_CURRENT_PROCESS.machine_id == machine.id and
timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)):
_CURRENT_PROCESS.ensure_log_files()
return _CURRENT_PROCESS
_CURRENT_PROCESS = None
@@ -945,6 +964,7 @@ class Process(models.Model):
db_start_time = existing.started_at.timestamp()
if abs(db_start_time - os_start_time) < START_TIME_TOLERANCE:
_CURRENT_PROCESS = existing
_CURRENT_PROCESS.ensure_log_files()
return existing
# No valid existing record - create new one
@@ -977,6 +997,7 @@ class Process(models.Model):
started_at=started_at,
status=cls.StatusChoices.RUNNING,
)
_CURRENT_PROCESS.ensure_log_files()
return _CURRENT_PROCESS
@classmethod
@@ -1089,7 +1110,7 @@ class Process(models.Model):
if is_stale:
proc.status = cls.StatusChoices.EXITED
proc.ended_at = proc.ended_at or timezone.now()
proc.exit_code = proc.exit_code if proc.exit_code is not None else -1
proc.exit_code = proc.exit_code if proc.exit_code is not None else 0
proc.save(update_fields=['status', 'ended_at', 'exit_code'])
cleaned += 1
@@ -1209,7 +1230,15 @@ class Process(models.Model):
the actual OS process exists and matches our record.
"""
proc = self.proc
return proc is not None and proc.is_running()
if proc is None:
return False
try:
# Treat zombies as not running (they should be reaped)
if proc.status() == psutil.STATUS_ZOMBIE:
return False
except Exception:
pass
return proc.is_running()
def is_alive(self) -> bool:
"""
@@ -1421,6 +1450,22 @@ class Process(models.Model):
except OSError:
pass
def ensure_log_files(self) -> None:
"""Ensure stdout/stderr log files exist for this process."""
if not self.pwd:
return
try:
Path(self.pwd).mkdir(parents=True, exist_ok=True)
except OSError:
return
try:
if self.stdout_file:
self.stdout_file.touch(exist_ok=True)
if self.stderr_file:
self.stderr_file.touch(exist_ok=True)
except OSError:
return
def _build_env(self) -> dict:
"""Build environment dict for subprocess, merging stored env with system."""
import json
@@ -1507,9 +1552,11 @@ class Process(models.Model):
proc.wait(timeout=self.timeout)
self.exit_code = proc.returncode
except subprocess.TimeoutExpired:
import signal
proc.kill()
proc.wait()
self.exit_code = -1
self.exit_code = 128 + signal.SIGKILL
self.ended_at = timezone.now()
if stdout_path.exists():
@@ -1579,9 +1626,19 @@ class Process(models.Model):
exit_code if exited, None if still running
"""
if self.status == self.StatusChoices.EXITED:
if self.exit_code == -1:
self.exit_code = 137
self.save(update_fields=['exit_code'])
return self.exit_code
if not self.is_running:
# Reap child process if it's a zombie (best-effort)
proc = self.proc
if proc is not None:
try:
proc.wait(timeout=0)
except Exception:
pass
# Process exited - read output and copy to DB
if self.stdout_file and self.stdout_file.exists():
self.stdout = self.stdout_file.read_text()
@@ -1603,7 +1660,9 @@ class Process(models.Model):
# cmd_file.unlink(missing_ok=True)
# Try to get exit code from proc or default to unknown
self.exit_code = self.exit_code if self.exit_code is not None else -1
self.exit_code = self.exit_code if self.exit_code is not None else 0
if self.exit_code == -1:
self.exit_code = 137
self.ended_at = timezone.now()
self.status = self.StatusChoices.EXITED
self.save()
@@ -1723,6 +1782,7 @@ class Process(models.Model):
import os
killed_count = 0
used_sigkill = False
proc = self.proc
if proc is None:
# Already dead
@@ -1772,11 +1832,15 @@ class Process(models.Model):
try:
os.kill(pid, signal.SIGKILL)
killed_count += 1
used_sigkill = True
except (OSError, ProcessLookupError):
pass
# Update self status
self.exit_code = 128 + signal.SIGTERM if killed_count > 0 else 0
if used_sigkill:
self.exit_code = 128 + signal.SIGKILL
else:
self.exit_code = 128 + signal.SIGTERM if killed_count > 0 else 0
self.status = self.StatusChoices.EXITED
self.ended_at = timezone.now()
self.save()
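The 128 + signal values follow the usual shell convention for signal-terminated processes (POSIX signal numbers):

```python
import signal

# Shell convention: exit code = 128 + signal number
assert 128 + signal.SIGTERM == 143  # graceful termination
assert 128 + signal.SIGKILL == 137  # force kill (also used for timeouts above)
```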
@@ -1925,6 +1989,50 @@ class Process(models.Model):
return 0
@classmethod
def cleanup_orphaned_workers(cls) -> int:
"""
Kill orphaned worker/hook processes whose root process is no longer running.
Orphaned if:
- Root (orchestrator/cli) is not running, or
- No orchestrator/cli ancestor exists.
Standalone worker runs (archivebox run --snapshot-id) are allowed.
"""
killed = 0
running_children = cls.objects.filter(
process_type__in=[cls.TypeChoices.WORKER, cls.TypeChoices.HOOK],
status=cls.StatusChoices.RUNNING,
)
for proc in running_children:
if not proc.is_running:
continue
root = proc.root
# Standalone worker/hook process (run directly)
if root.id == proc.id and root.process_type in (cls.TypeChoices.WORKER, cls.TypeChoices.HOOK):
continue
# If root is an active orchestrator/cli, keep it
if root.process_type in (cls.TypeChoices.ORCHESTRATOR, cls.TypeChoices.CLI) and root.is_running:
continue
try:
if proc.process_type == cls.TypeChoices.HOOK:
proc.kill_tree(graceful_timeout=1.0)
else:
proc.terminate(graceful_timeout=1.0)
killed += 1
except Exception:
continue
if killed:
print(f'[yellow]🧹 Cleaned up {killed} orphaned worker/hook process(es)[/yellow]')
return killed
# =============================================================================
# Binary State Machine
@@ -2126,5 +2234,3 @@ class ProcessMachine(BaseStateMachine, strict_states=True):
# Manually register state machines with python-statemachine registry
registry.register(BinaryMachine)
registry.register(ProcessMachine)

View File

@@ -79,9 +79,9 @@ class TestMachineModel(TestCase):
"""Machine.from_json() should update machine config."""
Machine.current() # Ensure machine exists
record = {
'_method': 'update',
'key': 'WGET_BINARY',
'value': '/usr/bin/wget',
'config': {
'WGET_BINARY': '/usr/bin/wget',
},
}
result = Machine.from_json(record)
@@ -190,12 +190,12 @@ class TestBinaryModel(TestCase):
old_modified = binary.modified_at
binary.update_and_requeue(
status=Binary.StatusChoices.STARTED,
status=Binary.StatusChoices.QUEUED,
retry_at=timezone.now() + timedelta(seconds=60),
)
binary.refresh_from_db()
self.assertEqual(binary.status, Binary.StatusChoices.STARTED)
self.assertEqual(binary.status, Binary.StatusChoices.QUEUED)
self.assertGreater(binary.modified_at, old_modified)
@@ -221,12 +221,12 @@ class TestBinaryStateMachine(TestCase):
def test_binary_state_machine_can_start(self):
"""BinaryMachine.can_start() should check name and binproviders."""
sm = BinaryMachine(self.binary)
self.assertTrue(sm.can_start())
self.assertTrue(sm.can_install())
self.binary.binproviders = ''
self.binary.save()
sm = BinaryMachine(self.binary)
self.assertFalse(sm.can_start())
self.assertFalse(sm.can_install())
class TestProcessModel(TestCase):
@@ -415,11 +415,15 @@ class TestProcessLifecycle(TestCase):
def test_process_is_running_current_pid(self):
"""is_running should be True for current PID."""
import psutil
from datetime import datetime
proc_start = datetime.fromtimestamp(psutil.Process(os.getpid()).create_time(), tz=timezone.get_current_timezone())
proc = Process.objects.create(
machine=self.machine,
status=Process.StatusChoices.RUNNING,
pid=os.getpid(),
started_at=timezone.now(),
started_at=proc_start,
)
self.assertTrue(proc.is_running)
@@ -450,6 +454,22 @@ class TestProcessLifecycle(TestCase):
proc.refresh_from_db()
self.assertEqual(proc.status, Process.StatusChoices.EXITED)
def test_process_poll_normalizes_negative_exit_code(self):
"""poll() should normalize -1 exit codes to 137."""
proc = Process.objects.create(
machine=self.machine,
status=Process.StatusChoices.EXITED,
pid=999999,
exit_code=-1,
started_at=timezone.now(),
)
exit_code = proc.poll()
self.assertEqual(exit_code, 137)
proc.refresh_from_db()
self.assertEqual(proc.exit_code, 137)
def test_process_terminate_dead_process(self):
"""terminate() should handle already-dead process."""
proc = Process.objects.create(

View File

@@ -180,9 +180,11 @@ def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
return len(f'file://{socket_file}') <= 96
tmp_is_valid = False
allow_no_unix_sockets = os.environ.get('ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS', '').lower() in ('1', 'true', 'yes')
try:
tmp_is_valid = dir_is_writable(tmp_dir)
tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir)
if not allow_no_unix_sockets:
tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir)
assert tmp_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}'
assert len(f'file://{socket_file}') <= 96, f'ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars.'
return True

View File

@@ -3,30 +3,29 @@ Rich Layout-based live progress display for ArchiveBox orchestrator.
Shows a comprehensive dashboard with:
- Top: Crawl queue status (full width)
- Middle: 4-column grid of SnapshotWorker progress panels
- Middle: Running process logs (dynamic panels)
- Bottom: Orchestrator/Daphne logs
"""
__package__ = 'archivebox.misc'
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any
from typing import List, Optional, Any
from collections import deque
from pathlib import Path
from rich import box
from rich.align import Align
from rich.console import Console, Group, RenderableType
from rich.console import Group
from rich.layout import Layout
from rich.columns import Columns
from rich.panel import Panel
from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn, SpinnerColumn
from rich.table import Table
from rich.text import Text
from rich.table import Table
from rich.tree import Tree
from archivebox.config import VERSION
# Maximum number of SnapshotWorker columns to display
MAX_WORKER_COLUMNS = 4
class CrawlQueuePanel:
"""Display crawl queue status across full width."""
@@ -35,6 +34,8 @@ class CrawlQueuePanel:
self.orchestrator_status = "Idle"
self.crawl_queue_count = 0
self.crawl_workers_count = 0
self.binary_queue_count = 0
self.binary_workers_count = 0
self.max_crawl_workers = 8
self.crawl_id: Optional[str] = None
@@ -51,19 +52,27 @@ class CrawlQueuePanel:
left_text.append(f"v{VERSION}", style="bold yellow")
left_text.append(f"{datetime.now(timezone.utc).strftime('%H:%M:%S')}", style="grey53")
# Center-left: Crawl queue status
# Center-left: Crawl + Binary queue status
queue_style = "yellow" if self.crawl_queue_count > 0 else "grey53"
center_left_text = Text()
center_left_text.append("Crawls: ", style="white")
center_left_text.append(str(self.crawl_queue_count), style=f"bold {queue_style}")
center_left_text.append(" queued", style="grey53")
center_left_text.append(" • Binaries: ", style="white")
binary_queue_style = "yellow" if self.binary_queue_count > 0 else "grey53"
center_left_text.append(str(self.binary_queue_count), style=f"bold {binary_queue_style}")
center_left_text.append(" queued", style="grey53")
# Center-right: CrawlWorker status
# Center-right: Worker status
worker_style = "green" if self.crawl_workers_count > 0 else "grey53"
center_right_text = Text()
center_right_text.append("Workers: ", style="white")
center_right_text.append(f"{self.crawl_workers_count}/{self.max_crawl_workers}", style=f"bold {worker_style}")
center_right_text.append(" active", style="grey53")
center_right_text.append(" crawl", style="grey53")
binary_worker_style = "green" if self.binary_workers_count > 0 else "grey53"
center_right_text.append("", style="grey53")
center_right_text.append(str(self.binary_workers_count), style=f"bold {binary_worker_style}")
center_right_text.append(" binary", style="grey53")
# Right: Orchestrator status
status_color = "green" if self.crawl_workers_count > 0 else "grey53"
@@ -74,151 +83,302 @@ class CrawlQueuePanel:
right_text.append(f" [{self.crawl_id[:8]}]", style="grey53")
grid.add_row(left_text, center_left_text, center_right_text, right_text)
return Panel(grid, style="white on blue", box=box.ROUNDED)
return Panel(grid, style="white on blue", box=box.HORIZONTALS)
class SnapshotWorkerPanel:
"""Display progress for a single SnapshotWorker."""
class ProcessLogPanel:
"""Display logs for a running Process."""
def __init__(self, worker_num: int):
self.worker_num = worker_num
self.snapshot_id: Optional[str] = None
self.snapshot_url: Optional[str] = None
self.total_hooks: int = 0
self.completed_hooks: int = 0
self.current_plugin: Optional[str] = None
self.status: str = "idle" # idle, working, completed
self.recent_logs: deque = deque(maxlen=5)
def __init__(self, process: Any, max_lines: int = 8, compact: bool | None = None):
self.process = process
self.max_lines = max_lines
self.compact = compact
def __rich__(self) -> Panel:
if self.status == "idle":
content = Align.center(
Text("Idle", style="grey53"),
vertical="middle",
)
border_style = "grey53"
title_style = "grey53"
else:
# Build progress display
lines = []
is_pending = self._is_pending()
output_line = '' if is_pending else self._output_line()
stdout_lines = []
stderr_lines = []
try:
stdout_lines = list(self.process.tail_stdout(lines=self.max_lines, follow=False))
stderr_lines = list(self.process.tail_stderr(lines=self.max_lines, follow=False))
except Exception:
stdout_lines = []
stderr_lines = []
# URL (truncated)
if self.snapshot_url:
url_display = self.snapshot_url[:35] + "..." if len(self.snapshot_url) > 35 else self.snapshot_url
lines.append(Text(url_display, style="cyan"))
lines.append(Text()) # Spacing
header_lines = []
chrome_launch_line = self._chrome_launch_line(stderr_lines, stdout_lines)
if chrome_launch_line:
header_lines.append(Text(chrome_launch_line, style="grey53"))
if output_line:
header_lines.append(Text(output_line, style="grey53"))
log_lines = []
for line in stdout_lines:
if line:
log_lines.append(Text(line, style="white"))
for line in stderr_lines:
if line:
log_lines.append(Text(line, style="cyan"))
# Progress bar
if self.total_hooks > 0:
pct = (self.completed_hooks / self.total_hooks) * 100
bar_width = 30
filled = int((pct / 100) * bar_width)
bar = "" * filled + "" * (bar_width - filled)
compact = self.compact if self.compact is not None else self._is_background_hook()
max_body = max(1, self.max_lines - len(header_lines))
if not log_lines:
log_lines = []
# Color based on progress
if pct < 30:
bar_style = "yellow"
elif pct < 100:
bar_style = "green"
else:
bar_style = "blue"
lines = header_lines + log_lines[-max_body:]
progress_text = Text()
progress_text.append(bar, style=bar_style)
progress_text.append(f" {pct:.0f}%", style="white")
lines.append(progress_text)
lines.append(Text()) # Spacing
# Stats
stats = Table.grid(padding=(0, 1))
stats.add_column(style="grey53", no_wrap=True)
stats.add_column(style="white")
stats.add_row("Hooks:", f"{self.completed_hooks}/{self.total_hooks}")
if self.current_plugin:
stats.add_row("Current:", Text(self.current_plugin, style="yellow"))
lines.append(stats)
lines.append(Text()) # Spacing
# Recent logs
if self.recent_logs:
lines.append(Text("Recent:", style="grey53"))
for log_msg, log_style in self.recent_logs:
log_text = Text(f"{log_msg[:30]}", style=log_style)
lines.append(log_text)
content = Group(*lines)
border_style = "green" if self.status == "working" else "blue"
title_style = "green" if self.status == "working" else "blue"
content = Group(*lines) if lines else Text("")
title = self._title()
border_style = "grey53" if is_pending else "cyan"
height = 2 if is_pending else None
return Panel(
content,
title=f"[{title_style}]Worker {self.worker_num}",
title=title,
border_style=border_style,
box=box.ROUNDED,
height=20,
box=box.HORIZONTALS,
padding=(0, 1),
height=height,
)
def add_log(self, message: str, style: str = "white"):
"""Add a log message to this worker's recent logs."""
self.recent_logs.append((message, style))
def _title(self) -> str:
process_type = getattr(self.process, 'process_type', 'process')
worker_type = getattr(self.process, 'worker_type', '')
pid = getattr(self.process, 'pid', None)
label = process_type
if process_type == 'worker' and worker_type:
label, worker_suffix = self._worker_label(worker_type)
elif process_type == 'hook':
try:
cmd = getattr(self.process, 'cmd', [])
hook_path = Path(cmd[1]) if len(cmd) > 1 else None
hook_name = hook_path.name if hook_path else 'hook'
plugin_name = hook_path.parent.name if hook_path and hook_path.parent.name else 'hook'
except Exception:
hook_name = 'hook'
plugin_name = 'hook'
label = f"{plugin_name}/{hook_name}"
worker_suffix = ''
else:
worker_suffix = ''
url = self._extract_url()
url_suffix = f" url={self._abbrev_url(url)}" if url else ""
time_suffix = self._elapsed_suffix()
title_style = "grey53" if self._is_pending() else "bold white"
if pid:
return f"[{title_style}]{label}[/{title_style}] [grey53]pid={pid}{worker_suffix}{url_suffix}{time_suffix}[/grey53]"
return f"[{title_style}]{label}[/{title_style}]{f' [grey53]{worker_suffix.strip()} {url_suffix.strip()}{time_suffix}[/grey53]' if (worker_suffix or url_suffix or time_suffix) else ''}".rstrip()
def _is_background_hook(self) -> bool:
if getattr(self.process, 'process_type', '') != 'hook':
return False
try:
cmd = getattr(self.process, 'cmd', [])
hook_path = Path(cmd[1]) if len(cmd) > 1 else None
hook_name = hook_path.name if hook_path else ''
return '.bg.' in hook_name
except Exception:
return False
def _is_pending(self) -> bool:
status = getattr(self.process, 'status', '')
if status in ('queued', 'pending', 'backoff'):
return True
if getattr(self.process, 'process_type', '') == 'hook' and not getattr(self.process, 'pid', None):
return True
return False
def _worker_label(self, worker_type: str) -> tuple[str, str]:
cmd = getattr(self.process, 'cmd', []) or []
if worker_type == 'crawl':
crawl_id = self._extract_arg(cmd, '--crawl-id')
suffix = ''
if crawl_id:
suffix = f" id={str(crawl_id)[-8:]}"
try:
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.filter(id=crawl_id).first()
if crawl:
urls = crawl.get_urls_list()
if urls:
url_list = self._abbrev_urls(urls)
suffix += f" urls={url_list}"
except Exception:
pass
return 'crawl', suffix
if worker_type == 'snapshot':
snapshot_id = self._extract_arg(cmd, '--snapshot-id')
suffix = ''
if snapshot_id:
suffix = f" id={str(snapshot_id)[-8:]}"
try:
from archivebox.core.models import Snapshot
snap = Snapshot.objects.filter(id=snapshot_id).first()
if snap and snap.url:
suffix += f" url={self._abbrev_url(snap.url, max_len=48)}"
except Exception:
pass
return 'snapshot', suffix
return f"worker:{worker_type}", ''
@staticmethod
def _extract_arg(cmd: list[str], key: str) -> str | None:
for i, part in enumerate(cmd):
if part.startswith(f'{key}='):
return part.split('=', 1)[1]
if part == key and i + 1 < len(cmd):
return cmd[i + 1]
return None
def _abbrev_urls(self, urls: list[str], max_len: int = 48) -> str:
if not urls:
return ''
if len(urls) == 1:
return self._abbrev_url(urls[0], max_len=max_len)
first = self._abbrev_url(urls[0], max_len=max_len)
return f"{first},+{len(urls) - 1}"
def _extract_url(self) -> str:
url = getattr(self.process, 'url', None)
if url:
return str(url)
cmd = getattr(self.process, 'cmd', []) or []
for i, part in enumerate(cmd):
if part.startswith('--url='):
return part.split('=', 1)[1].strip()
if part == '--url' and i + 1 < len(cmd):
return str(cmd[i + 1]).strip()
return ''
def _abbrev_url(self, url: str, max_len: int = 48) -> str:
if not url:
return ''
if len(url) <= max_len:
return url
return f"{url[:max_len - 3]}..."
def _chrome_launch_line(self, stderr_lines: list[str], stdout_lines: list[str]) -> str:
try:
cmd = getattr(self.process, 'cmd', [])
hook_path = Path(cmd[1]) if len(cmd) > 1 else None
hook_name = hook_path.name if hook_path else ''
if 'chrome_launch' not in hook_name:
return ''
pid = ''
ws = ''
for line in stderr_lines + stdout_lines:
if not ws and 'CDP URL:' in line:
ws = line.split('CDP URL:', 1)[1].strip()
if not pid and 'PID:' in line:
pid = line.split('PID:', 1)[1].strip()
if pid and ws:
return f"Chrome pid={pid} {ws}"
if ws:
return f"Chrome {ws}"
if pid:
return f"Chrome pid={pid}"
try:
from archivebox import DATA_DIR
base = Path(DATA_DIR)
pwd = getattr(self.process, 'pwd', None)
if pwd:
chrome_dir = Path(pwd)
if not chrome_dir.is_absolute():
chrome_dir = (base / chrome_dir).resolve()
cdp_file = chrome_dir / 'cdp_url.txt'
pid_file = chrome_dir / 'chrome.pid'
if cdp_file.exists():
ws = cdp_file.read_text().strip()
if pid_file.exists():
pid = pid_file.read_text().strip()
if pid and ws:
return f"Chrome pid={pid} {ws}"
if ws:
return f"Chrome {ws}"
if pid:
return f"Chrome pid={pid}"
except Exception:
pass
except Exception:
return ''
return ''
def _elapsed_suffix(self) -> str:
started_at = getattr(self.process, 'started_at', None)
timeout = getattr(self.process, 'timeout', None)
if not started_at or not timeout:
return ''
try:
now = datetime.now(timezone.utc) if started_at.tzinfo else datetime.now()
elapsed = int((now - started_at).total_seconds())
elapsed = max(elapsed, 0)
return f" [{elapsed}/{int(timeout)}s]"
except Exception:
return ''
def _output_line(self) -> str:
pwd = getattr(self.process, 'pwd', None)
if not pwd:
return ''
try:
from archivebox import DATA_DIR
rel = Path(pwd)
base = Path(DATA_DIR)
if rel.is_absolute():
try:
rel = rel.relative_to(base)
except Exception:
pass
rel_str = f"./{rel}" if not str(rel).startswith("./") else str(rel)
return f"{rel_str}"
except Exception:
return f"{pwd}"
class CrawlWorkerLogPanel:
"""Display CrawlWorker logs by tailing stdout/stderr from Process."""
class WorkerLogPanel:
"""Display worker logs by tailing stdout/stderr from Process."""
def __init__(self, max_lines: int = 8):
def __init__(self, title: str, empty_message: str, running_message: str, max_lines: int = 8):
self.title = title
self.empty_message = empty_message
self.running_message = running_message
self.log_lines: deque = deque(maxlen=max_lines * 2) # Allow more buffer
self.max_lines = max_lines
self.last_stdout_pos = 0 # Track file position for efficient tailing
self.last_stderr_pos = 0
self.last_process_running = False
def update_from_process(self, process: Any):
"""Update logs by tailing the Process stdout/stderr files."""
from pathlib import Path
if not process:
self.last_process_running = False
return
# Read new stdout lines since last read
# Use Process tail helpers for consistency
try:
stdout_path = Path(process.stdout)
if stdout_path.exists():
with open(stdout_path, 'r') as f:
# Seek to last read position
f.seek(self.last_stdout_pos)
new_lines = f.readlines()
# Update position
self.last_stdout_pos = f.tell()
# Add new lines (up to max_lines to avoid overflow)
for line in new_lines[-self.max_lines:]:
line = line.rstrip('\n')
if line and not line.startswith('['): # Skip Rich markup lines
self.log_lines.append(('stdout', line))
self.last_process_running = bool(getattr(process, 'is_running', False))
stdout_lines = list(process.tail_stdout(lines=self.max_lines, follow=False))
stderr_lines = list(process.tail_stderr(lines=self.max_lines, follow=False))
except Exception:
pass
return
# Read new stderr lines since last read
try:
stderr_path = Path(process.stderr)
if stderr_path.exists():
with open(stderr_path, 'r') as f:
f.seek(self.last_stderr_pos)
new_lines = f.readlines()
self.log_lines.clear()
self.last_stderr_pos = f.tell()
for line in new_lines[-self.max_lines:]:
line = line.rstrip('\n')
if line and not line.startswith('['): # Skip Rich markup lines
self.log_lines.append(('stderr', line))
except Exception:
pass
# Preserve ordering by showing stdout then stderr
for line in stdout_lines:
if line:
self.log_lines.append(('stdout', line))
for line in stderr_lines:
if line:
self.log_lines.append(('stderr', line))
def __rich__(self) -> Panel:
if not self.log_lines:
content = Text("No CrawlWorker logs yet", style="grey53", justify="center")
message = self.running_message if self.last_process_running else self.empty_message
content = Text(message, style="grey53", justify="center")
else:
# Get the last max_lines for display
display_lines = list(self.log_lines)[-self.max_lines:]
@@ -236,9 +396,9 @@ class CrawlWorkerLogPanel:
return Panel(
content,
title="[bold cyan]CrawlWorker Logs (stdout/stderr)",
title=f"[bold cyan]{self.title}",
border_style="cyan",
box=box.ROUNDED,
box=box.HORIZONTALS,
)
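
For context, the old implementation tracked byte offsets into the log files by hand; the new code defers to the Process tail helpers. A minimal sketch of the offset-tracking approach being replaced (plain stdlib, names illustrative):

from pathlib import Path

class LogTail:
    """Incrementally read new lines from a growing log file."""
    def __init__(self, path: str):
        self.path = Path(path)
        self.pos = 0  # byte offset reached by the previous poll

    def read_new_lines(self) -> list[str]:
        if not self.path.exists():
            return []
        with open(self.path, 'r') as f:
            f.seek(self.pos)         # resume where the last poll stopped
            lines = f.readlines()
            self.pos = f.tell()      # remember how far we got
        return [line.rstrip('\n') for line in lines]

The tradeoff is that hand-rolled offsets go stale if the file is truncated or rotated, which is presumably why the panel now re-tails the last N lines on every refresh instead.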
@@ -270,10 +430,71 @@ class OrchestratorLogPanel:
content,
title="[bold white]Orchestrator / Daphne Logs",
border_style="white",
box=box.ROUNDED,
box=box.HORIZONTALS,
)
class CrawlQueueTreePanel:
"""Display crawl queue with snapshots + hook summary in a tree view."""
def __init__(self, max_crawls: int = 8, max_snapshots: int = 16):
self.crawls: list[dict[str, Any]] = []
self.max_crawls = max_crawls
self.max_snapshots = max_snapshots
def update_crawls(self, crawls: list[dict[str, Any]]) -> None:
"""Update crawl tree data."""
self.crawls = crawls[:self.max_crawls]
def __rich__(self) -> Panel:
if not self.crawls:
content = Text("No active crawls", style="grey53", justify="center")
else:
trees = []
for crawl in self.crawls:
crawl_status = crawl.get('status', '')
crawl_label = crawl.get('label', '')
crawl_id = crawl.get('id', '')[:8]
crawl_text = Text(f"{self._status_icon(crawl_status)} {crawl_id} {crawl_label}", style="white")
crawl_tree = Tree(crawl_text, guide_style="grey53")
snapshots = crawl.get('snapshots', [])[:self.max_snapshots]
for snap in snapshots:
snap_status = snap.get('status', '')
snap_label = snap.get('label', '')
snap_text = Text(f"{self._status_icon(snap_status)} {snap_label}", style="white")
snap_node = crawl_tree.add(snap_text)
hooks = snap.get('hooks', {})
if hooks:
completed = hooks.get('completed', 0)
running = hooks.get('running', 0)
pending = hooks.get('pending', 0)
summary = f"{completed} | ▶️ {running} | ⌛️ {pending}"
snap_node.add(Text(summary, style="grey53"))
trees.append(crawl_tree)
content = Group(*trees)
return Panel(
content,
title="[bold white]Crawl Queue",
border_style="white",
box=box.HORIZONTALS,
)
@staticmethod
def _status_icon(status: str) -> str:
if status in ('queued', 'pending'):
return '⌛️'
if status in ('started', 'running'):
return '▶️'
if status in ('sealed', 'done', 'completed'):
return '✅'
if status in ('failed', 'error'):
return '❌'
return ''
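
For readers unfamiliar with rich.tree, a minimal standalone sketch of the Tree nesting that CrawlQueueTreePanel builds above (all data here is placeholder):

from rich import print as rprint
from rich.text import Text
from rich.tree import Tree

crawl_tree = Tree(Text("▶️ 1a2b3c4d example.com crawl"), guide_style="grey53")
snap_node = crawl_tree.add(Text("⌛️ https://example.com/page"))
snap_node.add(Text("✅ 3 | ▶️ 1 | ⌛️ 2", style="grey53"))  # hook summary line
rprint(crawl_tree)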
class ArchiveBoxProgressLayout:
"""
Main layout manager for ArchiveBox orchestrator progress display.
@@ -281,15 +502,8 @@ class ArchiveBoxProgressLayout:
Layout structure:
┌─────────────────────────────────────────────────────────────┐
│ Crawl Queue (full width) │
├──────────────────────────────────────────────────────────┤
│ Snapshot │ Snapshot │ Snapshot │ Snapshot │
│ Worker 1 │ Worker 2 │ Worker 3 │ Worker 4 │
│ │ │ │ │
│ Progress + │ Progress + │ Progress + │ Progress + │
│ Stats + │ Stats + │ Stats + │ Stats + │
│ Logs │ Logs │ Logs │ Logs │
├───────────────┴───────────────┴───────────────┴─────────────┤
│ CrawlWorker Logs (stdout/stderr) │
├─────────────────────────────────────────────────────────────┤
│ Running Process Logs (dynamic panels) │
├─────────────────────────────────────────────────────────────┤
│ Orchestrator / Daphne Logs │
└─────────────────────────────────────────────────────────────┘
@@ -303,51 +517,33 @@ class ArchiveBoxProgressLayout:
self.crawl_queue = CrawlQueuePanel()
self.crawl_queue.crawl_id = crawl_id
# Create 4 worker panels
self.worker_panels = [SnapshotWorkerPanel(i + 1) for i in range(MAX_WORKER_COLUMNS)]
self.crawl_worker_log = CrawlWorkerLogPanel(max_lines=8)
self.process_panels: List[ProcessLogPanel] = []
self.orchestrator_log = OrchestratorLogPanel(max_events=8)
self.crawl_queue_tree = CrawlQueueTreePanel(max_crawls=8, max_snapshots=16)
# Create layout
self.layout = self._make_layout()
# Track snapshot ID to worker panel mapping
self.snapshot_to_worker: Dict[str, int] = {} # snapshot_id -> worker_panel_index
def _make_layout(self) -> Layout:
"""Define the layout structure."""
layout = Layout(name="root")
# Top-level split: crawl_queue, workers, logs
# Top-level split: crawl_queue, workers, bottom
layout.split(
Layout(name="crawl_queue", size=3),
Layout(name="workers", ratio=1),
Layout(name="logs", size=20),
)
# Split workers into 4 columns
layout["workers"].split_row(
Layout(name="worker1"),
Layout(name="worker2"),
Layout(name="worker3"),
Layout(name="worker4"),
)
# Split logs into crawl_worker_logs and orchestrator_logs
layout["logs"].split(
Layout(name="crawl_worker_logs", size=10),
Layout(name="orchestrator_logs", size=10),
Layout(name="processes", ratio=1),
Layout(name="bottom", size=12),
)
# Assign components to layout sections
layout["crawl_queue"].update(self.crawl_queue)
layout["worker1"].update(self.worker_panels[0])
layout["worker2"].update(self.worker_panels[1])
layout["worker3"].update(self.worker_panels[2])
layout["worker4"].update(self.worker_panels[3])
layout["crawl_worker_logs"].update(self.crawl_worker_log)
layout["processes"].update(Columns([]))
layout["bottom"].split_row(
Layout(name="orchestrator_logs", ratio=2),
Layout(name="crawl_tree", ratio=1),
)
layout["orchestrator_logs"].update(self.orchestrator_log)
layout["crawl_tree"].update(self.crawl_queue_tree)
return layout
@@ -356,82 +552,53 @@ class ArchiveBoxProgressLayout:
status: str,
crawl_queue_count: int = 0,
crawl_workers_count: int = 0,
binary_queue_count: int = 0,
binary_workers_count: int = 0,
max_crawl_workers: int = 8,
):
"""Update orchestrator status in the crawl queue panel."""
self.crawl_queue.orchestrator_status = status
self.crawl_queue.crawl_queue_count = crawl_queue_count
self.crawl_queue.crawl_workers_count = crawl_workers_count
self.crawl_queue.binary_queue_count = binary_queue_count
self.crawl_queue.binary_workers_count = binary_workers_count
self.crawl_queue.max_crawl_workers = max_crawl_workers
def update_snapshot_worker(
self,
snapshot_id: str,
url: str,
total: int,
completed: int,
current_plugin: str = "",
):
"""Update or assign a snapshot to a worker panel."""
# Find or assign worker panel for this snapshot
if snapshot_id not in self.snapshot_to_worker:
# Find first idle worker panel
worker_idx = None
for idx, panel in enumerate(self.worker_panels):
if panel.status == "idle":
worker_idx = idx
break
def update_process_panels(self, processes: List[Any], pending: Optional[List[Any]] = None) -> None:
"""Update process panels to show all running processes."""
panels = []
all_processes = list(processes) + list(pending or [])
for process in all_processes:
is_hook = getattr(process, 'process_type', '') == 'hook'
is_bg = False
if is_hook:
try:
cmd = getattr(process, 'cmd', [])
hook_path = Path(cmd[1]) if len(cmd) > 1 else None
hook_name = hook_path.name if hook_path else ''
is_bg = '.bg.' in hook_name
except Exception:
is_bg = False
is_pending = getattr(process, 'status', '') in ('queued', 'pending', 'backoff') or (is_hook and not getattr(process, 'pid', None))
max_lines = 2 if is_pending else (4 if is_bg else 7)
panels.append(ProcessLogPanel(process, max_lines=max_lines, compact=is_bg))
if not panels:
self.layout["processes"].size = 0
self.layout["processes"].update(Text(""))
return
# If no idle worker, use round-robin (shouldn't happen often)
if worker_idx is None:
worker_idx = len(self.snapshot_to_worker) % MAX_WORKER_COLUMNS
self.layout["processes"].size = None
self.layout["processes"].ratio = 1
self.layout["processes"].update(Columns(panels, equal=True, expand=True))
self.snapshot_to_worker[snapshot_id] = worker_idx
def update_crawl_tree(self, crawls: list[dict[str, Any]]) -> None:
"""Update the crawl queue tree panel."""
self.crawl_queue_tree.update_crawls(crawls)
# Get assigned worker panel
worker_idx = self.snapshot_to_worker[snapshot_id]
panel = self.worker_panels[worker_idx]
# Update panel
panel.snapshot_id = snapshot_id
panel.snapshot_url = url
panel.total_hooks = total
panel.completed_hooks = completed
panel.current_plugin = current_plugin
panel.status = "working" if completed < total else "completed"
def remove_snapshot_worker(self, snapshot_id: str):
"""Mark a snapshot worker as idle after completion."""
if snapshot_id in self.snapshot_to_worker:
worker_idx = self.snapshot_to_worker[snapshot_id]
panel = self.worker_panels[worker_idx]
# Mark as idle
panel.status = "idle"
panel.snapshot_id = None
panel.snapshot_url = None
panel.total_hooks = 0
panel.completed_hooks = 0
panel.current_plugin = None
panel.recent_logs.clear()
# Remove mapping
del self.snapshot_to_worker[snapshot_id]
def log_to_worker(self, snapshot_id: str, message: str, style: str = "white"):
"""Add a log message to a specific worker's panel."""
if snapshot_id in self.snapshot_to_worker:
worker_idx = self.snapshot_to_worker[snapshot_id]
self.worker_panels[worker_idx].add_log(message, style)
def log_event(self, message: str, style: str = "white"):
def log_event(self, message: str, style: str = "white") -> None:
"""Add an event to the orchestrator log."""
self.orchestrator_log.add_event(message, style)
def update_crawl_worker_logs(self, process: Any):
"""Update CrawlWorker logs by tailing the Process stdout/stderr files."""
self.crawl_worker_log.update_from_process(process)
def get_layout(self) -> Layout:
"""Get the Rich Layout object for rendering."""
return self.layout
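
For context, the layout above is plain rich.layout usage: named regions created with split()/split_row(), updated by name, and rendered under rich.live.Live. A minimal self-contained sketch of the same structure (panel contents are placeholders):

import time
from rich.layout import Layout
from rich.live import Live
from rich.panel import Panel

layout = Layout(name="root")
layout.split(
    Layout(name="crawl_queue", size=3),
    Layout(name="processes", ratio=1),
    Layout(name="bottom", size=12),
)
layout["bottom"].split_row(
    Layout(name="orchestrator_logs", ratio=2),
    Layout(name="crawl_tree", ratio=1),
)
layout["crawl_queue"].update(Panel("queue stats"))
layout["processes"].update(Panel("process panels"))
layout["orchestrator_logs"].update(Panel("orchestrator logs"))
layout["crawl_tree"].update(Panel("crawl tree"))

with Live(layout, refresh_per_second=4):
    time.sleep(2)  # real code mutates the panels from the orchestrator loop here

Note the trick in update_process_panels: setting layout["processes"].size = 0 collapses the region entirely when there is nothing to show, and size = None with ratio = 1 restores it.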

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--accessibility" title="Accessibility"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="4.5" r="2" fill="currentColor" stroke="none"/><path d="M4 7.5h16"/><path d="M12 7.5v12"/><path d="M7 20l5-6 5 6"/></svg></span>

View File

@@ -1 +0,0 @@
"""Tests for the accessibility plugin."""

View File

@@ -10,7 +10,7 @@ import json
import sys
import rich_click as click
from abx_pkg import Binary, AptProvider
from abx_pkg import Binary, AptProvider, BinProviderOverrides
# Fix pydantic forward reference issue
AptProvider.model_rebuild()

View File

@@ -1 +0,0 @@
"""Tests for the apt binary provider plugin."""

View File

@@ -21,7 +21,7 @@ from django.test import TestCase
# Get the path to the apt provider hook
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_apt_provider.py'
INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_apt_install.py'), None)
def apt_available() -> bool:
@@ -48,7 +48,7 @@ class TestAptProviderHook(TestCase):
def test_hook_script_exists(self):
"""Hook script should exist."""
self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
def test_hook_skips_when_apt_not_allowed(self):
"""Hook should skip when apt not in allowed binproviders."""

View File

@@ -47,6 +47,9 @@ def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
def log(message: str) -> None:
print(f'[archivedotorg] {message}', file=sys.stderr)
try:
import requests
except ImportError:
@@ -56,6 +59,8 @@ def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
submit_url = f'https://web.archive.org/save/{url}'
log(f'Submitting to Wayback Machine (timeout={timeout}s)')
log(f'GET {submit_url}')
try:
response = requests.get(
@@ -64,31 +69,40 @@ def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
headers={'User-Agent': user_agent},
allow_redirects=True,
)
log(f'HTTP {response.status_code} final_url={response.url}')
# Check for successful archive
content_location = response.headers.get('Content-Location', '')
x_archive_orig_url = response.headers.get('X-Archive-Orig-Url', '')
if content_location:
log(f'Content-Location: {content_location}')
if x_archive_orig_url:
log(f'X-Archive-Orig-Url: {x_archive_orig_url}')
# Build archive URL
if content_location:
archive_url = f'https://web.archive.org{content_location}'
Path(OUTPUT_FILE).write_text(archive_url, encoding='utf-8')
log(f'Saved archive URL -> {archive_url}')
return True, OUTPUT_FILE, ''
elif 'web.archive.org' in response.url:
# We were redirected to an archive page
Path(OUTPUT_FILE).write_text(response.url, encoding='utf-8')
log(f'Redirected to archive page -> {response.url}')
return True, OUTPUT_FILE, ''
else:
# Check for errors in response
if 'RobotAccessControlException' in response.text:
# Blocked by robots.txt - save submit URL for manual retry
Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
log('Blocked by robots.txt, saved submit URL for manual retry')
return True, OUTPUT_FILE, '' # Consider this a soft success
elif response.status_code >= 400:
return False, None, f'HTTP {response.status_code}'
else:
# Save submit URL anyway
Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
log('No archive URL returned, saved submit URL for manual retry')
return True, OUTPUT_FILE, ''
except requests.Timeout:

View File

@@ -1 +1 @@
🏛️
<span class="abx-output-icon abx-output-icon--archivedotorg" title="Archive.org"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M3 7h18"/><rect x="3" y="7" width="18" height="13" rx="2"/><path d="M9 12h6"/></svg></span>

View File

@@ -1 +0,0 @@
{"type": "Binary", "name": "chrome", "binproviders": "npm,env,brew,apt", "overrides": {"npm": {"packages": ["@puppeteer/browsers"]}}}

View File

@@ -1253,7 +1253,7 @@ function getExtensionTargets(browser) {
}
/**
* Find Chromium/Chrome binary path.
* Find Chromium binary path.
* Checks CHROME_BINARY env var first, then falls back to system locations.
*
* @returns {string|null} - Absolute path to browser binary or null if not found
@@ -1276,7 +1276,9 @@ function findChromium() {
const chromeBinary = getEnv('CHROME_BINARY');
if (chromeBinary) {
const absPath = path.resolve(chromeBinary);
if (validateBinary(absPath)) {
if (absPath.includes('Google Chrome') || absPath.includes('google-chrome')) {
console.error('[!] Warning: CHROME_BINARY points to Chrome. Chromium is required for extension support.');
} else if (validateBinary(absPath)) {
return absPath;
}
console.error(`[!] Warning: CHROME_BINARY="${chromeBinary}" is not valid`);
@@ -1309,7 +1311,7 @@ function findChromium() {
return null;
};
// 3. Search fallback locations (Chromium first, then Chrome)
// 3. Search fallback locations (Chromium only)
const fallbackLocations = [
// System Chromium
'/Applications/Chromium.app/Contents/MacOS/Chromium',
@@ -1318,10 +1320,6 @@ function findChromium() {
// Puppeteer cache
path.join(process.env.HOME || '', '.cache/puppeteer/chromium'),
path.join(process.env.HOME || '', '.cache/puppeteer'),
// Chrome (fallback - extensions may not work in 137+)
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/usr/bin/google-chrome',
'/usr/bin/google-chrome-stable',
];
for (const loc of fallbackLocations) {
@@ -1332,9 +1330,6 @@ function findChromium() {
return binary;
}
} else if (validateBinary(loc)) {
if (loc.includes('Google Chrome') || loc.includes('google-chrome')) {
console.error('[!] Warning: Using Chrome instead of Chromium. Extension loading may not work in Chrome 137+');
}
return loc;
}
}
@@ -1699,10 +1694,10 @@ module.exports = {
// Chrome launching
launchChromium,
killChrome,
// Chrome/Chromium install
// Chromium install
installChromium,
installPuppeteerCore,
// Chrome/Chromium binary finding
// Chromium binary finding
findChromium,
// Extension utilities
getExtensionId,
@@ -1744,7 +1739,7 @@ if (require.main === module) {
console.log('Usage: chrome_utils.js <command> [args...]');
console.log('');
console.log('Commands:');
console.log(' findChromium Find Chrome/Chromium binary');
console.log(' findChromium Find Chromium binary');
console.log(' installChromium Install Chromium via @puppeteer/browsers');
console.log(' installPuppeteerCore Install puppeteer-core npm package');
console.log(' launchChromium Launch Chrome with CDP debugging');

View File

@@ -7,13 +7,13 @@
"type": "boolean",
"default": true,
"x-aliases": ["USE_CHROME"],
"description": "Enable Chrome/Chromium browser integration for archiving"
"description": "Enable Chromium browser integration for archiving"
},
"CHROME_BINARY": {
"type": "string",
"default": "chromium",
"x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"],
"description": "Path to Chrome/Chromium binary"
"description": "Path to Chromium binary"
},
"CHROME_NODE_BINARY": {
"type": "string",

View File

@@ -1,265 +0,0 @@
#!/usr/bin/env python3
"""
Install hook for Chrome/Chromium and puppeteer-core.
Runs at crawl start to install/find Chromium and puppeteer-core.
Also validates config and computes derived values.
Outputs:
- JSONL for Binary and Machine config updates
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
Respects CHROME_BINARY env var for custom binary paths.
Uses `npx @puppeteer/browsers install chromium@latest` and parses output.
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
--load-extension and --disable-extensions-except flags, which are needed for
loading unpacked extensions in headless mode.
"""
import os
import sys
import json
import subprocess
from pathlib import Path
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def detect_docker() -> bool:
"""Detect if running inside Docker container."""
return (
os.path.exists('/.dockerenv') or
os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes') or
os.path.exists('/run/.containerenv')
)
def get_chrome_version(binary_path: str) -> str | None:
"""Get Chrome/Chromium version string."""
try:
result = subprocess.run(
[binary_path, '--version'],
capture_output=True,
text=True,
timeout=5
)
if result.returncode == 0:
return result.stdout.strip()
except Exception:
pass
return None
def install_puppeteer_core() -> bool:
"""Install puppeteer-core to NODE_MODULES_DIR if not present."""
node_modules_dir = os.environ.get('NODE_MODULES_DIR', '').strip()
if not node_modules_dir:
# No isolated node_modules, skip (will use global)
return True
node_modules_path = Path(node_modules_dir)
if (node_modules_path / 'puppeteer-core').exists():
return True
# Get npm prefix from NODE_MODULES_DIR (parent of node_modules)
npm_prefix = node_modules_path.parent
try:
print(f"[*] Installing puppeteer-core to {npm_prefix}...", file=sys.stderr)
result = subprocess.run(
['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core', '@puppeteer/browsers'],
capture_output=True,
text=True,
timeout=60
)
if result.returncode == 0:
print(f"[+] puppeteer-core installed", file=sys.stderr)
return True
else:
print(f"[!] Failed to install puppeteer-core: {result.stderr}", file=sys.stderr)
return False
except Exception as e:
print(f"[!] Failed to install puppeteer-core: {e}", file=sys.stderr)
return False
def install_chromium() -> dict | None:
"""Install Chromium using @puppeteer/browsers and parse output for binary path.
Output format: "chromium@<version> <path_to_binary>"
e.g.: "chromium@1563294 /Users/x/.cache/puppeteer/chromium/.../Chromium"
Note: npx is fast when chromium is already cached - it returns the path without re-downloading.
"""
try:
print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr)
# Use --path to install to puppeteer's standard cache location
cache_path = os.path.expanduser('~/.cache/puppeteer')
result = subprocess.run(
['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}'],
capture_output=True,
text=True,
stdin=subprocess.DEVNULL,
timeout=300
)
if result.returncode != 0:
print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr)
return None
# Parse output: "chromium@1563294 /path/to/Chromium"
output = result.stdout.strip()
parts = output.split(' ', 1)
if len(parts) != 2:
print(f"[!] Failed to parse install output: {output}", file=sys.stderr)
return None
version_str = parts[0] # "chromium@1563294"
binary_path = parts[1].strip()
if not binary_path or not os.path.exists(binary_path):
print(f"[!] Binary not found at: {binary_path}", file=sys.stderr)
return None
# Extract version number
version = version_str.split('@')[1] if '@' in version_str else None
print(f"[+] Chromium installed: {binary_path}", file=sys.stderr)
return {
'name': 'chromium',
'abspath': binary_path,
'version': version,
'binprovider': 'puppeteer',
}
except subprocess.TimeoutExpired:
print("[!] Chromium install timed out", file=sys.stderr)
except FileNotFoundError:
print("[!] npx not found - is Node.js installed?", file=sys.stderr)
except Exception as e:
print(f"[!] Failed to install Chromium: {e}", file=sys.stderr)
return None
def main():
warnings = []
errors = []
computed = {}
# Install puppeteer-core if NODE_MODULES_DIR is set
install_puppeteer_core()
# Check if Chrome is enabled
chrome_enabled = get_env_bool('CHROME_ENABLED', True)
# Detect Docker and adjust sandbox
in_docker = detect_docker()
computed['IN_DOCKER'] = str(in_docker).lower()
chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
if in_docker and chrome_sandbox:
warnings.append(
"Running in Docker with CHROME_SANDBOX=true. "
"Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
)
# Auto-disable sandbox in Docker unless explicitly set
if not get_env('CHROME_SANDBOX'):
computed['CHROME_SANDBOX'] = 'false'
# Check Node.js availability
node_binary = get_env('NODE_BINARY', 'node')
computed['NODE_BINARY'] = node_binary
# Check if CHROME_BINARY is already set and valid
configured_binary = get_env('CHROME_BINARY', '')
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
version = get_chrome_version(configured_binary)
computed['CHROME_BINARY'] = configured_binary
computed['CHROME_VERSION'] = version or 'unknown'
print(json.dumps({
'type': 'Binary',
'name': 'chromium',
'abspath': configured_binary,
'version': version,
'binprovider': 'env',
}))
# Output computed values
for key, value in computed.items():
print(f"COMPUTED:{key}={value}")
for warning in warnings:
print(f"WARNING:{warning}", file=sys.stderr)
sys.exit(0)
# Install/find Chromium via puppeteer
result = install_chromium()
if result and result.get('abspath'):
computed['CHROME_BINARY'] = result['abspath']
computed['CHROME_VERSION'] = result['version'] or 'unknown'
print(json.dumps({
'type': 'Binary',
'name': result['name'],
'abspath': result['abspath'],
'version': result['version'],
'binprovider': result['binprovider'],
}))
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/CHROME_BINARY',
'value': result['abspath'],
}))
if result['version']:
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/CHROMIUM_VERSION',
'value': result['version'],
}))
# Output computed values
for key, value in computed.items():
print(f"COMPUTED:{key}={value}")
for warning in warnings:
print(f"WARNING:{warning}", file=sys.stderr)
sys.exit(0)
else:
errors.append("Chromium binary not found")
computed['CHROME_BINARY'] = ''
# Output computed values and errors
for key, value in computed.items():
print(f"COMPUTED:{key}={value}")
for warning in warnings:
print(f"WARNING:{warning}", file=sys.stderr)
for error in errors:
print(f"ERROR:{error}", file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':
main()
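
The deleted hook above mixed two stdout conventions: JSONL records plus COMPUTED:KEY=VALUE lines for hooks.py to merge into the env. A rough sketch of the consuming side (illustrative only; the actual parser in hooks.py is not part of this diff):

import json

def parse_hook_stdout(stdout: str) -> tuple[list[dict], dict[str, str]]:
    """Split hook output into JSONL records and COMPUTED env updates."""
    records: list[dict] = []
    computed: dict[str, str] = {}
    for line in stdout.splitlines():
        line = line.strip()
        if line.startswith('{'):
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                continue
        elif line.startswith('COMPUTED:') and '=' in line:
            key, value = line[len('COMPUTED:'):].split('=', 1)
            computed[key] = value
    return records, computed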

View File

@@ -0,0 +1,34 @@
#!/usr/bin/env python3
"""
Emit Chromium Binary dependency for the crawl.
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
--load-extension and --disable-extensions-except flags, which are needed for
loading unpacked extensions in headless mode.
"""
import json
import os
import sys
def main():
# Check if Chrome is enabled
chrome_enabled = os.environ.get('CHROME_ENABLED', 'true').lower() not in ('false', '0', 'no', 'off')
if not chrome_enabled:
sys.exit(0)
record = {
'type': 'Binary',
'name': 'chromium',
'binproviders': 'puppeteer,env',
'overrides': {
'puppeteer': ['chromium@latest', '--install-deps'],
},
}
print(json.dumps(record))
sys.exit(0)
if __name__ == '__main__':
main()
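
When CHROME_ENABLED is truthy, the replacement hook above emits exactly one JSONL line for the worker to turn into a Binary dependency:

{"type": "Binary", "name": "chromium", "binproviders": "puppeteer,env", "overrides": {"puppeteer": ["chromium@latest", "--install-deps"]}}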

View File

@@ -3,12 +3,12 @@
* Launch a shared Chromium browser session for the entire crawl.
*
* This runs once per crawl and keeps Chromium alive for all snapshots to share.
* Each snapshot creates its own tab via on_Snapshot__20_chrome_tab.bg.js.
* Each snapshot creates its own tab via on_Snapshot__10_chrome_tab.bg.js.
*
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
* --load-extension and --disable-extensions-except flags.
*
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Usage: on_Crawl__90_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Output: Writes to current directory (executor creates chrome/ dir):
* - cdp_url.txt: WebSocket URL for CDP connection
* - chrome.pid: Chromium process ID (for cleanup)
@@ -31,7 +31,7 @@ if (process.env.NODE_MODULES_DIR) {
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
const puppeteer = require('puppeteer');
const {
findChromium,
launchChromium,

View File

@@ -2,11 +2,11 @@
/**
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
*
* If a crawl-level Chrome session exists (from on_Crawl__20_chrome_launch.bg.js),
* If a crawl-level Chrome session exists (from on_Crawl__90_chrome_launch.bg.js),
* this connects to it and creates a new tab. Otherwise, falls back to launching
* its own Chrome instance.
*
* Usage: on_Snapshot__20_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> --crawl-id=<uuid>
* Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> --crawl-id=<uuid>
* Output: Creates chrome/ directory under snapshot output dir with:
* - cdp_url.txt: WebSocket URL for CDP connection
* - chrome.pid: Chrome process ID (from crawl)
@@ -15,11 +15,14 @@
*
* Environment variables:
* CRAWL_OUTPUT_DIR: Crawl output directory (to find crawl's Chrome session)
* CHROME_BINARY: Path to Chrome/Chromium binary (for fallback)
* CHROME_BINARY: Path to Chromium binary (for fallback)
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
*
* This is a background hook that stays alive until SIGTERM so the tab
* can be closed cleanly at the end of the snapshot run.
*/
const fs = require('fs');
@@ -28,7 +31,7 @@ const { spawn } = require('child_process');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const puppeteer = require('puppeteer');
const {
findChromium,
getEnv,
@@ -43,6 +46,11 @@ const PLUGIN_NAME = 'chrome_tab';
const OUTPUT_DIR = '.'; // Hook already runs in chrome/ output directory
const CHROME_SESSION_DIR = '.';
let finalStatus = 'failed';
let finalOutput = '';
let finalError = '';
let cmdVersion = '';
let finalized = false;
// Parse command line arguments
function parseArgs() {
@@ -56,8 +64,31 @@ function parseArgs() {
return args;
}
function emitResult(statusOverride) {
if (finalized) return;
finalized = true;
const status = statusOverride || finalStatus;
const outputStr = status === 'succeeded'
? finalOutput
: (finalError || finalOutput || '');
const result = {
type: 'ArchiveResult',
status,
output_str: outputStr,
};
if (cmdVersion) {
result.cmd_version = cmdVersion;
}
console.log(JSON.stringify(result));
}
// Cleanup handler for SIGTERM - close this snapshot's tab
async function cleanup() {
async function cleanup(signal) {
if (signal) {
console.error(`\nReceived ${signal}, closing chrome tab...`);
}
try {
const cdpFile = path.join(OUTPUT_DIR, 'cdp_url.txt');
const targetIdFile = path.join(OUTPUT_DIR, 'target_id.txt');
@@ -78,12 +109,13 @@ async function cleanup() {
} catch (e) {
// Best effort
}
process.exit(0);
emitResult();
process.exit(finalStatus === 'succeeded' ? 0 : 1);
}
// Register signal handlers
process.on('SIGTERM', cleanup);
process.on('SIGINT', cleanup);
process.on('SIGTERM', () => cleanup('SIGTERM'));
process.on('SIGINT', () => cleanup('SIGINT'));
// Try to find the crawl's Chrome session
function findCrawlChromeSession(crawlId) {
@@ -272,23 +304,22 @@ async function main() {
const crawlId = args.crawl_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__20_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> [--crawl-id=<uuid>]');
console.error('Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> [--crawl-id=<uuid>]');
process.exit(1);
}
const startTs = new Date();
let status = 'failed';
let output = null;
let output = '';
let error = '';
let version = '';
try {
const binary = findChromium();
if (!binary) {
console.error('ERROR: Chrome/Chromium binary not found');
console.error('DEPENDENCY_NEEDED=chrome');
console.error('ERROR: Chromium binary not found');
console.error('DEPENDENCY_NEEDED=chromium');
console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
console.error('INSTALL_HINT=npx @puppeteer/browsers install chrome@stable');
console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest');
process.exit(1);
}
@@ -327,24 +358,22 @@ async function main() {
status = 'failed';
}
const endTs = new Date();
if (error) {
console.error(`ERROR: ${error}`);
}
// Output clean JSONL (no RESULT_JSON= prefix)
const result = {
type: 'ArchiveResult',
status,
output_str: output || error || '',
};
if (version) {
result.cmd_version = version;
}
console.log(JSON.stringify(result));
finalStatus = status;
finalOutput = output || '';
finalError = error || '';
cmdVersion = version || '';
process.exit(status === 'succeeded' ? 0 : 1);
if (status !== 'succeeded') {
emitResult(status);
process.exit(1);
}
console.log('[*] Chrome tab created, waiting for cleanup signal...');
await new Promise(() => {}); // Keep alive until SIGTERM
}
main().catch(e => {
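
The restructured tab hook follows the background-hook contract used throughout this commit: do setup, stash the result, block until signaled, and emit a single ArchiveResult JSONL line on the way out. A rough Python rendering of that contract (names illustrative, not the actual executor API):

import json
import signal
import sys
import time

final = {'type': 'ArchiveResult', 'status': 'failed', 'output_str': ''}
emitted = False

def emit_and_exit(signum, frame):
    global emitted
    if not emitted:
        emitted = True
        print(json.dumps(final), flush=True)  # exactly one result line on stdout
    sys.exit(0 if final['status'] == 'succeeded' else 1)

signal.signal(signal.SIGTERM, emit_and_exit)
signal.signal(signal.SIGINT, emit_and_exit)

# ... setup work happens here; on success, record it and wait for the executor ...
final.update(status='succeeded', output_str='chrome/cdp_url.txt')
while True:
    time.sleep(60)  # keep alive until SIGTERM/SIGINT from the hook executor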

View File

@@ -0,0 +1,76 @@
#!/usr/bin/env node
/**
* Wait for Chrome session files to exist (cdp_url.txt + target_id.txt).
*
* This is a foreground hook that blocks until the Chrome tab is ready,
* so downstream hooks can safely connect to CDP.
*
* Usage: on_Snapshot__11_chrome_wait.js --url=<url> --snapshot-id=<uuid>
*/
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const {
getEnvInt,
waitForChromeSession,
readCdpUrl,
readTargetId,
} = require('./chrome_utils.js');
const CHROME_SESSION_DIR = '.';
function parseArgs() {
const args = {};
process.argv.slice(2).forEach(arg => {
if (arg.startsWith('--')) {
const [key, ...valueParts] = arg.slice(2).split('=');
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
}
});
return args;
}
async function main() {
const args = parseArgs();
const url = args.url;
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__11_chrome_wait.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}
const timeoutSeconds = getEnvInt('CHROME_TAB_TIMEOUT', getEnvInt('CHROME_TIMEOUT', getEnvInt('TIMEOUT', 60)));
const timeoutMs = timeoutSeconds * 1000;
console.error(`[chrome_wait] Waiting for Chrome session (timeout=${timeoutSeconds}s)...`);
const ready = await waitForChromeSession(CHROME_SESSION_DIR, timeoutMs);
if (!ready) {
const error = `Chrome session not ready after ${timeoutSeconds}s (cdp_url.txt/target_id.txt missing)`;
console.error(`[chrome_wait] ERROR: ${error}`);
console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error }));
process.exit(1);
}
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
const targetId = readTargetId(CHROME_SESSION_DIR);
if (!cdpUrl || !targetId) {
const error = 'Chrome session files incomplete (cdp_url.txt/target_id.txt missing)';
console.error(`[chrome_wait] ERROR: ${error}`);
console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error }));
process.exit(1);
}
console.error(`[chrome_wait] Chrome session ready (cdp_url=${cdpUrl.slice(0, 32)}..., target_id=${targetId}).`);
console.log(JSON.stringify({ type: 'ArchiveResult', status: 'succeeded', output_str: 'chrome session ready' }));
process.exit(0);
}
main().catch(e => {
console.error(`Fatal error: ${e.message}`);
process.exit(1);
});
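
waitForChromeSession itself lives in chrome_utils.js and is not shown in this diff; the behavior the wait hook depends on is presumably just a poll for both session files, roughly:

import time
from pathlib import Path

def wait_for_chrome_session(session_dir: str, timeout_ms: int, poll_ms: int = 250) -> bool:
    """Poll until cdp_url.txt and target_id.txt both exist, or time out."""
    cdp = Path(session_dir) / 'cdp_url.txt'
    target = Path(session_dir) / 'target_id.txt'
    deadline = time.monotonic() + timeout_ms / 1000
    while time.monotonic() < deadline:
        if cdp.exists() and target.exists():
            return True
        time.sleep(poll_ms / 1000)
    return False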

View File

@@ -19,7 +19,7 @@ const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const puppeteer = require('puppeteer');
const PLUGIN_NAME = 'chrome_navigate';
const CHROME_SESSION_DIR = '.';

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--chrome" title="Chrome"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="4.5" width="18" height="15" rx="2"/><path d="M3 9h18"/><circle cx="7" cy="7" r="1" fill="currentColor" stroke="none"/><circle cx="11" cy="7" r="1" fill="currentColor" stroke="none"/></svg></span>

View File

@@ -60,6 +60,7 @@ import os
import platform
import signal
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
@@ -72,11 +73,14 @@ CHROME_PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
# Hook script locations
CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__01_chrome_install.py'
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__70_chrome_install.py'
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__90_chrome_launch.bg.js'
CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__10_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'
PUPPETEER_BINARY_HOOK = PLUGINS_ROOT / 'puppeteer' / 'on_Binary__12_puppeteer_install.py'
PUPPETEER_CRAWL_HOOK = PLUGINS_ROOT / 'puppeteer' / 'on_Crawl__60_puppeteer_install.py'
NPM_BINARY_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__10_npm_install.py'
# =============================================================================
@@ -402,7 +406,7 @@ def run_hook(
# Determine interpreter based on file extension
if hook_script.suffix == '.py':
cmd = ['python', str(hook_script)]
cmd = [sys.executable, str(hook_script)]
elif hook_script.suffix == '.js':
cmd = ['node', str(hook_script)]
else:
@@ -451,6 +455,128 @@ def parse_jsonl_output(stdout: str, record_type: str = 'ArchiveResult') -> Optio
return None
def parse_jsonl_records(stdout: str) -> List[Dict[str, Any]]:
"""Parse all JSONL records from stdout."""
records: List[Dict[str, Any]] = []
for line in stdout.strip().split('\n'):
line = line.strip()
if not line.startswith('{'):
continue
try:
records.append(json.loads(line))
except json.JSONDecodeError:
continue
return records
def apply_machine_updates(records: List[Dict[str, Any]], env: dict) -> None:
"""Apply Machine update records to env dict in-place."""
for record in records:
if record.get('type') != 'Machine':
continue
config = record.get('config')
if not isinstance(config, dict):
continue
env.update(config)
def install_chromium_with_hooks(env: dict, timeout: int = 300) -> str:
"""Install Chromium via chrome crawl hook + puppeteer/npm hooks.
Returns absolute path to Chromium binary.
"""
puppeteer_result = subprocess.run(
[sys.executable, str(PUPPETEER_CRAWL_HOOK)],
capture_output=True,
text=True,
timeout=timeout,
env=env,
)
if puppeteer_result.returncode != 0:
raise RuntimeError(f"Puppeteer crawl hook failed: {puppeteer_result.stderr}")
puppeteer_record = parse_jsonl_output(puppeteer_result.stdout, record_type='Binary') or {}
if not puppeteer_record or puppeteer_record.get('name') != 'puppeteer':
raise RuntimeError("Puppeteer Binary record not emitted by crawl hook")
npm_cmd = [
sys.executable,
str(NPM_BINARY_HOOK),
'--machine-id=test-machine',
'--binary-id=test-puppeteer',
'--name=puppeteer',
f"--binproviders={puppeteer_record.get('binproviders', '*')}",
]
puppeteer_overrides = puppeteer_record.get('overrides')
if puppeteer_overrides:
npm_cmd.append(f'--overrides={json.dumps(puppeteer_overrides)}')
npm_result = subprocess.run(
npm_cmd,
capture_output=True,
text=True,
timeout=timeout,
env=env,
)
if npm_result.returncode != 0:
raise RuntimeError(f"Npm install failed: {npm_result.stderr}")
apply_machine_updates(parse_jsonl_records(npm_result.stdout), env)
chrome_result = subprocess.run(
[sys.executable, str(CHROME_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=timeout,
env=env,
)
if chrome_result.returncode != 0:
raise RuntimeError(f"Chrome install hook failed: {chrome_result.stderr}")
chrome_record = parse_jsonl_output(chrome_result.stdout, record_type='Binary') or {}
if not chrome_record or chrome_record.get('name') not in ('chromium', 'chrome'):
raise RuntimeError("Chrome Binary record not emitted by crawl hook")
chromium_cmd = [
sys.executable,
str(PUPPETEER_BINARY_HOOK),
'--machine-id=test-machine',
'--binary-id=test-chromium',
f"--name={chrome_record.get('name', 'chromium')}",
f"--binproviders={chrome_record.get('binproviders', '*')}",
]
chrome_overrides = chrome_record.get('overrides')
if chrome_overrides:
chromium_cmd.append(f'--overrides={json.dumps(chrome_overrides)}')
result = subprocess.run(
chromium_cmd,
capture_output=True,
text=True,
timeout=timeout,
env=env,
)
if result.returncode != 0:
raise RuntimeError(f"Puppeteer chromium install failed: {result.stderr}")
records = parse_jsonl_records(result.stdout)
chromium_record = None
for record in records:
if record.get('type') == 'Binary' and record.get('name') in ('chromium', 'chrome'):
chromium_record = record
break
if not chromium_record:
chromium_record = parse_jsonl_output(result.stdout, record_type='Binary')
chromium_path = chromium_record.get('abspath')
if not chromium_path or not Path(chromium_path).exists():
raise RuntimeError(f"Chromium binary not found after install: {chromium_path}")
env['CHROME_BINARY'] = chromium_path
apply_machine_updates(records, env)
return chromium_path
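
Condensed, the helper above chains puppeteer crawl hook -> npm install -> chrome install hook -> puppeteer binary install, mutating env as it goes. A sketch of how the test fixtures use it (per the setup_test_env/chrome_session code below):

env = setup_test_env(Path(tmpdir))           # seeds DATA_DIR, LIB_DIR, NODE_MODULES_DIR, ...
chromium = install_chromium_with_hooks(env)  # raises RuntimeError if any hook fails
assert Path(chromium).exists()
assert env['CHROME_BINARY'] == chromium      # downstream hooks read the path from env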
def run_hook_and_parse(
hook_script: Path,
url: str,
@@ -499,7 +625,7 @@ def setup_test_env(tmpdir: Path) -> dict:
crawls/
snapshots/
Calls chrome install hook which handles puppeteer-core and chromium installation.
Calls chrome install hook + puppeteer/npm hooks for Chromium installation.
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
Args:
@@ -559,31 +685,10 @@ def setup_test_env(tmpdir: Path) -> dict:
if 'CHROME_HEADLESS' not in os.environ:
env['CHROME_HEADLESS'] = 'true'
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
result = subprocess.run(
['python', str(CHROME_INSTALL_HOOK)],
capture_output=True, text=True, timeout=120, env=env
)
if result.returncode != 0:
pytest.skip(f"Chrome install hook failed: {result.stderr}")
# Parse JSONL output to get CHROME_BINARY
chrome_binary = None
for line in result.stdout.strip().split('\n'):
if not line.strip():
continue
try:
data = json.loads(line)
if data.get('type') == 'Binary' and data.get('abspath'):
chrome_binary = data['abspath']
break
except json.JSONDecodeError:
continue
if not chrome_binary or not Path(chrome_binary).exists():
pytest.skip(f"Chromium binary not found: {chrome_binary}")
env['CHROME_BINARY'] = chrome_binary
try:
install_chromium_with_hooks(env)
except RuntimeError as e:
pytest.skip(str(e))
return env
@@ -790,17 +895,8 @@ def chrome_session(
'CHROME_HEADLESS': 'true',
})
# CRITICAL: Run chrome install hook first (installs puppeteer-core and chromium)
# chrome_launch assumes chrome_install has already run
install_result = subprocess.run(
['python', str(CHROME_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=120,
env=env
)
if install_result.returncode != 0:
raise RuntimeError(f"Chrome install failed: {install_result.stderr}")
# Install Chromium via npm + puppeteer hooks using normal Binary flow
install_chromium_with_hooks(env)
# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(

View File

@@ -30,9 +30,8 @@ import platform
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
get_lib_dir,
get_node_modules_dir,
find_chromium_binary,
install_chromium_with_hooks,
CHROME_PLUGIN_DIR as PLUGIN_DIR,
CHROME_LAUNCH_HOOK,
CHROME_TAB_HOOK,
@@ -41,58 +40,24 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
@pytest.fixture(scope="session", autouse=True)
def ensure_chromium_and_puppeteer_installed(tmp_path_factory):
"""Ensure Chromium and puppeteer are installed before running tests.
Puppeteer handles Chromium installation automatically in its own cache.
We only need to install puppeteer itself to LIB_DIR/npm.
"""
from abx_pkg import Binary, NpmProvider, BinProviderOverrides
# Set DATA_DIR if not already set (required by abx_pkg)
"""Ensure Chromium and puppeteer are installed before running tests."""
if not os.environ.get('DATA_DIR'):
# Use isolated temp dir for direct pytest runs
test_data_dir = tmp_path_factory.mktemp('chrome_test_data')
os.environ['DATA_DIR'] = str(test_data_dir)
env = get_test_env()
# Compute paths AFTER setting DATA_DIR
lib_dir = get_lib_dir()
node_modules_dir = get_node_modules_dir()
npm_prefix = lib_dir / 'npm'
try:
chromium_binary = install_chromium_with_hooks(env)
except RuntimeError as e:
pytest.skip(str(e))
# Rebuild pydantic models
NpmProvider.model_rebuild()
# Install puppeteer if not available (it will handle Chromium in its own cache)
puppeteer_core_path = node_modules_dir / 'puppeteer-core'
if not puppeteer_core_path.exists():
print(f"\n[*] Installing puppeteer to {npm_prefix}...")
npm_prefix.mkdir(parents=True, exist_ok=True)
provider = NpmProvider(npm_prefix=npm_prefix)
try:
binary = Binary(
name='puppeteer',
binproviders=[provider],
overrides={'npm': {'packages': ['puppeteer@^23.5.0']}}
)
binary.install()
print(f"[*] Puppeteer installed successfully to {npm_prefix}")
except Exception as e:
pytest.skip(f"Failed to install puppeteer: {e}")
# Find Chromium binary (puppeteer installs it automatically in its cache)
chromium_binary = find_chromium_binary()
if not chromium_binary:
pytest.skip("Chromium not found - puppeteer should install it automatically")
pytest.skip("Chromium not found after install")
# Set CHROME_BINARY env var for tests
os.environ['CHROME_BINARY'] = chromium_binary
# Get paths from helpers (will use DATA_DIR if set, or compute based on __file__)
LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = get_node_modules_dir()
NPM_PREFIX = LIB_DIR / 'npm'
for key in ('NODE_MODULES_DIR', 'NODE_PATH', 'PATH'):
if env.get(key):
os.environ[key] = env[key]
def test_hook_scripts_exist():

View File

@@ -32,6 +32,13 @@ const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'console.jsonl';
const CHROME_SESSION_DIR = '../chrome';
let browser = null;
let page = null;
let logCount = 0;
let errorCount = 0;
let requestFailCount = 0;
let shuttingDown = false;
async function serializeArgs(args) {
const serialized = [];
for (const arg of args) {
@@ -73,6 +80,7 @@ async function setupListeners() {
location: msg.location(),
};
fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
logCount += 1;
} catch (e) {
// Ignore errors
}
@@ -87,6 +95,7 @@ async function setupListeners() {
stack: error.stack || '',
};
fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
errorCount += 1;
} catch (e) {
// Ignore
}
@@ -103,6 +112,7 @@ async function setupListeners() {
url: request.url(),
};
fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
requestFailCount += 1;
} catch (e) {
// Ignore
}
@@ -111,6 +121,29 @@ async function setupListeners() {
return { browser, page };
}
function emitResult(status = 'succeeded') {
if (shuttingDown) return;
shuttingDown = true;
const counts = `${logCount} console, ${errorCount} errors, ${requestFailCount} failed requests`;
console.log(JSON.stringify({
type: 'ArchiveResult',
status,
output_str: `${OUTPUT_FILE} (${counts})`,
}));
}
async function handleShutdown(signal) {
console.error(`\nReceived ${signal}, emitting final results...`);
emitResult('succeeded');
if (browser) {
try {
browser.disconnect();
} catch (e) {}
}
process.exit(0);
}
async function main() {
const args = parseArgs();
const url = args.url;
@@ -127,23 +160,27 @@ async function main() {
process.exit(0);
}
const timeout = getEnvInt('CONSOLELOG_TIMEOUT', 30) * 1000;
try {
// Set up listeners BEFORE navigation
await setupListeners();
const connection = await setupListeners();
browser = connection.browser;
page = connection.page;
// Wait for chrome_navigate to complete (BLOCKING)
await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
// Register signal handlers for graceful shutdown
process.on('SIGTERM', () => handleShutdown('SIGTERM'));
process.on('SIGINT', () => handleShutdown('SIGINT'));
// Output clean JSONL
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'succeeded',
output_str: OUTPUT_FILE,
}));
// Wait for chrome_navigate to complete (non-fatal)
try {
const timeout = getEnvInt('CONSOLELOG_TIMEOUT', 30) * 1000;
await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
} catch (e) {
console.error(`WARN: ${e.message}`);
}
process.exit(0);
// console.error('Consolelog active, waiting for cleanup signal...');
await new Promise(() => {}); // Keep alive until SIGTERM
return;
} catch (e) {
const error = `${e.name}: ${e.message}`;

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--consolelog" title="Console Log"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="4.5" width="18" height="15" rx="2"/><path d="M7 12l2 2-2 2"/><path d="M11 16h6"/></svg></span>

View File

@@ -1 +0,0 @@
"""Tests for the consolelog plugin."""

View File

@@ -10,6 +10,7 @@ import shutil
import subprocess
import sys
import tempfile
import time
from pathlib import Path
import pytest
@@ -76,26 +77,33 @@ class TestConsolelogWithChrome(TestCase):
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
# Run consolelog hook with the active Chrome session
result = subprocess.run(
# Run consolelog hook with the active Chrome session (background hook)
result = subprocess.Popen(
['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
timeout=120, # Longer timeout as it waits for navigation
env=env
)
# Check for output file
console_output = snapshot_chrome_dir / 'console.jsonl'
# Verify hook ran (may succeed or timeout waiting for navigation)
# The hook is designed to wait for page_loaded.txt from chrome_navigate
# In test mode, that file may not exist, so hook may timeout
# But it should still create the console.jsonl file
# Allow it to run briefly, then terminate (background hook)
time.sleep(3)
if result.poll() is None:
result.terminate()
try:
stdout, stderr = result.communicate(timeout=5)
except subprocess.TimeoutExpired:
result.kill()
stdout, stderr = result.communicate()
else:
stdout, stderr = result.communicate()
# At minimum, verify no crash
self.assertNotIn('Traceback', result.stderr)
self.assertNotIn('Traceback', stderr)
# If output file exists, verify it's valid JSONL
if console_output.exists():

View File

@@ -59,9 +59,16 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c
provider = EnvProvider()
try:
binary = Binary(name=name, binproviders=[provider]).load()
except Exception as e:
click.echo(f"{name} not found after custom install: {e}", err=True)
sys.exit(1)
except Exception:
try:
binary = Binary(
name=name,
binproviders=[provider],
overrides={'env': {'version': '0.0.1'}},
).load()
except Exception as e:
click.echo(f"{name} not found after custom install: {e}", err=True)
sys.exit(1)
if not binary.abspath:
click.echo(f"{name} not found after custom install", err=True)

View File

@@ -1 +0,0 @@
"""Tests for the custom binary provider plugin."""

View File

@@ -17,7 +17,7 @@ from django.test import TestCase
# Get the path to the custom provider hook
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_custom_bash.py'
INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_custom_install.py'), None)
class TestCustomProviderHook(TestCase):
@@ -34,7 +34,7 @@ class TestCustomProviderHook(TestCase):
def test_hook_script_exists(self):
"""Hook script should exist."""
self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
def test_hook_skips_when_custom_not_allowed(self):
"""Hook should skip when custom not in allowed binproviders."""

View File

@@ -32,6 +32,11 @@ const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'dns.jsonl';
const CHROME_SESSION_DIR = '../chrome';
let browser = null;
let page = null;
let recordCount = 0;
let shuttingDown = false;
function extractHostname(url) {
try {
const urlObj = new URL(url);
@@ -121,6 +126,7 @@ async function setupListener(targetUrl) {
// Append to output file
fs.appendFileSync(outputPath, JSON.stringify(dnsRecord) + '\n');
recordCount += 1;
} catch (e) {
// Ignore errors
@@ -170,6 +176,7 @@ async function setupListener(targetUrl) {
};
fs.appendFileSync(outputPath, JSON.stringify(dnsRecord) + '\n');
recordCount += 1;
}
} catch (e) {
// Ignore errors
@@ -179,6 +186,28 @@ async function setupListener(targetUrl) {
return { browser, page, client };
}
function emitResult(status = 'succeeded') {
if (shuttingDown) return;
shuttingDown = true;
console.log(JSON.stringify({
type: 'ArchiveResult',
status,
output_str: `${OUTPUT_FILE} (${recordCount} DNS records)`,
}));
}
async function handleShutdown(signal) {
console.error(`\nReceived ${signal}, emitting final results...`);
emitResult('succeeded');
if (browser) {
try {
browser.disconnect();
} catch (e) {}
}
process.exit(0);
}
async function main() {
const args = parseArgs();
const url = args.url;
@@ -195,31 +224,27 @@ async function main() {
process.exit(0);
}
const timeout = getEnvInt('DNS_TIMEOUT', 30) * 1000;
try {
// Set up listener BEFORE navigation
await setupListener(url);
const connection = await setupListener(url);
browser = connection.browser;
page = connection.page;
// Wait for chrome_navigate to complete (BLOCKING)
await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
// Register signal handlers for graceful shutdown
process.on('SIGTERM', () => handleShutdown('SIGTERM'));
process.on('SIGINT', () => handleShutdown('SIGINT'));
// Count DNS records
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let recordCount = 0;
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf8');
recordCount = content.split('\n').filter(line => line.trim()).length;
// Wait for chrome_navigate to complete (non-fatal)
try {
const timeout = getEnvInt('DNS_TIMEOUT', 30) * 1000;
await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
} catch (e) {
console.error(`WARN: ${e.message}`);
}
// Output clean JSONL
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'succeeded',
output_str: `${OUTPUT_FILE} (${recordCount} DNS records)`,
}));
process.exit(0);
// console.error('DNS listener active, waiting for cleanup signal...');
await new Promise(() => {}); // Keep alive until SIGTERM
return;
} catch (e) {
const error = `${e.name}: ${e.message}`;
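The same shutdown contract, as a Python sketch for background extractor authors (the ArchiveResult record shape is copied from the hook above; everything else is illustrative, not the actual dns plugin):

import json
import signal
import sys
import time

record_count = 0
shutting_down = False

def emit_result(status='succeeded'):
    global shutting_down
    if shutting_down:
        return
    shutting_down = True
    print(json.dumps({
        'type': 'ArchiveResult',
        'status': status,
        'output_str': f'dns.jsonl ({record_count} DNS records)',
    }))

def handle_shutdown(signum, frame):
    emit_result('succeeded')
    sys.exit(0)

signal.signal(signal.SIGTERM, handle_shutdown)
signal.signal(signal.SIGINT, handle_shutdown)

while True:  # keep alive until the orchestrator signals cleanup
    time.sleep(1)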

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--dns" title="DNS"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="6" cy="12" r="2"/><circle cx="18" cy="6" r="2"/><circle cx="18" cy="18" r="2"/><path d="M8 12h6"/><path d="M16 8l-2 2"/><path d="M16 16l-2-2"/></svg></span>

View File

@@ -52,7 +52,21 @@ const CHROME_SESSION_DIR = '../chrome';
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
if (!fs.existsSync(STATICFILE_DIR)) return false;
const stdoutPath = path.join(STATICFILE_DIR, 'stdout.log');
if (!fs.existsSync(stdoutPath)) return false;
const stdout = fs.readFileSync(stdoutPath, 'utf8');
for (const line of stdout.split('\n')) {
const trimmed = line.trim();
if (!trimmed.startsWith('{')) continue;
try {
const record = JSON.parse(trimmed);
if (record.type === 'ArchiveResult' && record.status === 'succeeded') {
return true;
}
} catch (e) {}
}
return false;
}
// Wait for chrome tab to be fully loaded

View File

@@ -1 +1 @@
🌐
<span class="abx-output-icon abx-output-icon--dom" title="DOM"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M8 9l-3 3 3 3"/><path d="M16 9l3 3-3 3"/><path d="M10 20l4-16"/></svg></span>

View File

@@ -142,7 +142,7 @@ def test_staticfile_present_skips():
# dom/ <- dom extractor runs here, looks for ../staticfile
staticfile_dir = tmpdir / 'staticfile'
staticfile_dir.mkdir()
(staticfile_dir / 'index.html').write_text('<html>test</html>')
(staticfile_dir / 'stdout.log').write_text('{"type":"ArchiveResult","status":"succeeded","output_str":"index.html"}\n')
dom_dir = tmpdir / 'dom'
dom_dir.mkdir()

View File

@@ -25,7 +25,8 @@ from abx_pkg import Binary, EnvProvider
@click.option('--binary-id', required=True, help="Dependency UUID")
@click.option('--name', required=True, help="Binary name to find")
@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)")
def main(binary_id: str, machine_id: str, name: str, binproviders: str):
@click.option('--overrides', default=None, help="JSON-encoded overrides dict (unused)")
def main(binary_id: str, machine_id: str, name: str, binproviders: str, overrides: str | None):
"""Check if binary is available in PATH and record it."""
# Check if env provider is allowed

View File

@@ -1 +0,0 @@
"""Tests for the env binary provider plugin."""

View File

@@ -17,7 +17,7 @@ from django.test import TestCase
# Get the path to the env provider hook
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_env_provider.py'
INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_env_install.py'), None)
class TestEnvProviderHook(TestCase):
@@ -34,7 +34,7 @@ class TestEnvProviderHook(TestCase):
def test_hook_script_exists(self):
"""Hook script should exist."""
self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
def test_hook_finds_python(self):
"""Hook should find python3 binary in PATH."""

View File

@@ -126,7 +126,12 @@ def main(url: str, snapshot_id: str):
try:
# Run extraction
success, output, error = get_favicon(url)
status = 'succeeded' if success else 'failed'
if success:
status = 'succeeded'
elif error == 'No favicon found':
status = 'skipped'
else:
status = 'failed'
except Exception as e:
error = f'{type(e).__name__}: {e}'
@@ -143,7 +148,7 @@ def main(url: str, snapshot_id: str):
}
print(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
if __name__ == '__main__':

View File

@@ -1 +1 @@
<span class="abx-output-icon abx-output-icon--favicon" title="Favicon"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M12 3l2.5 5.5 6 .5-4.5 3.8 1.5 5.7L12 15.5 6.5 18.5 8 12.8 3.5 9l6-.5z"/></svg></span>

View File

@@ -1 +0,0 @@
{"type": "Binary", "name": "forum-dl", "binproviders": "pip,env"}

View File

@@ -1,80 +0,0 @@
#!/usr/bin/env python3
"""
Detect forum-dl binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if forum-dl is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
forumdl_enabled = get_env_bool('FORUMDL_ENABLED', True)
forumdl_binary = get_env('FORUMDL_BINARY', 'forum-dl')
if not forumdl_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=forumdl_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='forum-dl')
else:
# Binary not found
output_binary_missing(name='forum-dl', binproviders='pip')
except Exception:
# Binary not found
output_binary_missing(name='forum-dl', binproviders='pip')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,79 @@
#!/usr/bin/env python3
"""
Emit forum-dl Binary dependency for the crawl.
"""
import json
import os
import sys
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary(name: str, binproviders: str, overrides: dict | None = None):
"""Output Binary JSONL record for a dependency."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders,
'machine_id': machine_id,
}
if overrides:
record['overrides'] = overrides
print(json.dumps(record))
def main():
forumdl_enabled = get_env_bool('FORUMDL_ENABLED', True)
if not forumdl_enabled:
sys.exit(0)
output_binary(
name='forum-dl',
binproviders='pip,env',
overrides={
'pip': {
'packages': [
'--no-deps',
'forum-dl',
'pydantic',
'pydantic-core',
'typing-extensions',
'annotated-types',
'typing-inspection',
'beautifulsoup4',
'soupsieve',
'lxml',
'requests',
'urllib3',
'certifi',
'idna',
'charset-normalizer',
'tenacity',
'python-dateutil',
'six',
'html2text',
'warcio',
]
}
},
)
sys.exit(0)
if __name__ == '__main__':
main()
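For reference, the JSONL record this hook prints to stdout looks like the following (machine_id comes from $MACHINE_ID and may be empty; the pip package list is truncated here for readability):

{"type": "Binary", "name": "forum-dl", "binproviders": "pip,env", "machine_id": "<MACHINE_ID>", "overrides": {"pip": {"packages": ["--no-deps", "forum-dl", "pydantic", "..."]}}}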

View File

@@ -2,7 +2,7 @@
"""
Download forum content from a URL using forum-dl.
Usage: on_Snapshot__forumdl.py --url=<url> --snapshot-id=<uuid>
Usage: on_Snapshot__04_forumdl.bg.py --url=<url> --snapshot-id=<uuid>
Output: Downloads forum content to $PWD/
Environment variables:
@@ -19,6 +19,7 @@ import json
import os
import subprocess
import sys
import threading
from pathlib import Path
import rich_click as click
@@ -131,13 +132,41 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
cmd.append(url)
try:
result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
print(f'[forumdl] Starting download (timeout={timeout}s)', file=sys.stderr)
output_lines: list[str] = []
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
)
def _read_output() -> None:
if not process.stdout:
return
for line in process.stdout:
output_lines.append(line)
sys.stderr.write(line)
reader = threading.Thread(target=_read_output, daemon=True)
reader.start()
try:
process.wait(timeout=timeout)
except subprocess.TimeoutExpired:
process.kill()
reader.join(timeout=1)
return False, None, f'Timed out after {timeout} seconds'
reader.join(timeout=1)
combined_output = ''.join(output_lines)
# Check if output file was created
if output_file.exists() and output_file.stat().st_size > 0:
return True, str(output_file), ''
else:
stderr = result.stderr
stderr = combined_output
# These are NOT errors - page simply has no downloadable forum content
stderr_lower = stderr.lower()
@@ -147,7 +176,7 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
return True, None, '' # No forum found - success, no output
if 'extractornotfounderror' in stderr_lower:
return True, None, '' # No forum extractor for this URL - success, no output
if result.returncode == 0:
if process.returncode == 0:
return True, None, '' # forum-dl exited cleanly, just no forum - success
# These ARE errors - something went wrong
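The same Popen-plus-reader-thread change lands in the forum-dl, gallery-dl, and papers-dl hooks; distilled into one helper it looks like this (a sketch; `run_streaming` is our name, not the plugin's):

import subprocess
import sys
import threading

def run_streaming(cmd: list[str], timeout: int) -> tuple[int | None, str]:
    """Run cmd, tee combined stdout+stderr to our stderr, return (returncode, output)."""
    lines: list[str] = []
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT, text=True, bufsize=1)

    def _read() -> None:
        if not process.stdout:
            return
        for line in process.stdout:
            lines.append(line)
            sys.stderr.write(line)

    reader = threading.Thread(target=_read, daemon=True)
    reader.start()
    try:
        process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        process.kill()  # caller treats a None returncode as a timeout
    reader.join(timeout=1)
    return process.returncode, ''.join(lines)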

View File

@@ -1 +1 @@
💬
<span class="abx-output-icon abx-output-icon--forumdl" title="Forum"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M4 5h16v10H7l-3 3V5z"/></svg></span>

View File

@@ -1 +0,0 @@
{"type": "Binary", "name": "gallery-dl", "binproviders": "pip,brew,apt,env"}

View File

@@ -1,80 +0,0 @@
#!/usr/bin/env python3
"""
Detect gallery-dl binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if gallery-dl is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
gallerydl_enabled = get_env_bool('GALLERYDL_ENABLED', True)
gallerydl_binary = get_env('GALLERYDL_BINARY', 'gallery-dl')
if not gallerydl_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=gallerydl_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='gallery-dl')
else:
# Binary not found
output_binary_missing(name='gallery-dl', binproviders='pip')
except Exception:
# Binary not found
output_binary_missing(name='gallery-dl', binproviders='pip')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,48 @@
#!/usr/bin/env python3
"""
Emit gallery-dl Binary dependency for the crawl.
"""
import json
import os
import sys
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary(name: str, binproviders: str):
"""Output Binary JSONL record for a dependency."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders,
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
gallerydl_enabled = get_env_bool('GALLERYDL_ENABLED', True)
if not gallerydl_enabled:
sys.exit(0)
output_binary(name='gallery-dl', binproviders='pip,brew,apt,env')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -2,7 +2,7 @@
"""
Download image galleries from a URL using gallery-dl.
Usage: on_Snapshot__gallerydl.py --url=<url> --snapshot-id=<uuid>
Usage: on_Snapshot__03_gallerydl.bg.py --url=<url> --snapshot-id=<uuid>
Output: Downloads gallery images to $PWD/gallerydl/
Environment variables:
@@ -19,6 +19,7 @@ import json
import os
import subprocess
import sys
import threading
from pathlib import Path
import rich_click as click
@@ -70,7 +71,22 @@ STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""
staticfile_dir = Path(STATICFILE_DIR)
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
if not staticfile_dir.exists():
return False
stdout_log = staticfile_dir / 'stdout.log'
if not stdout_log.exists():
return False
for line in stdout_log.read_text(errors='ignore').splitlines():
line = line.strip()
if not line.startswith('{'):
continue
try:
record = json.loads(line)
except json.JSONDecodeError:
continue
if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded':
return True
return False
def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
@@ -109,7 +125,35 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
cmd.append(url)
try:
result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
print(f'[gallerydl] Starting download (timeout={timeout}s)', file=sys.stderr)
output_lines: list[str] = []
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
)
def _read_output() -> None:
if not process.stdout:
return
for line in process.stdout:
output_lines.append(line)
sys.stderr.write(line)
reader = threading.Thread(target=_read_output, daemon=True)
reader.start()
try:
process.wait(timeout=timeout)
except subprocess.TimeoutExpired:
process.kill()
reader.join(timeout=1)
return False, None, f'Timed out after {timeout} seconds'
reader.join(timeout=1)
combined_output = ''.join(output_lines)
# Check if any gallery files were downloaded (search recursively)
gallery_extensions = (
@@ -132,7 +176,7 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
output = str(image_files[0]) if image_files else str(downloaded_files[0])
return True, output, ''
else:
stderr = result.stderr
stderr = combined_output
# These are NOT errors - page simply has no downloadable gallery
# Return success with no output (legitimate "nothing to download")
@@ -141,7 +185,7 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
return True, None, '' # Not a gallery site - success, no output
if 'no results' in stderr_lower:
return True, None, '' # No gallery found - success, no output
if result.returncode == 0:
if process.returncode == 0:
return True, None, '' # gallery-dl exited cleanly, just no gallery - success
# These ARE errors - something went wrong

View File

@@ -1 +1 @@
🖼️
<span class="abx-output-icon abx-output-icon--gallerydl" title="Gallery"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="5" width="18" height="14" rx="2"/><circle cx="8" cy="10" r="1.5" fill="currentColor" stroke="none"/><path d="M21 17l-5-5-5 5"/></svg></span>

View File

@@ -1 +0,0 @@
{"type": "Binary", "name": "git", "binproviders": "apt,brew,env"}

View File

@@ -0,0 +1,48 @@
#!/usr/bin/env python3
"""
Emit git Binary dependency for the crawl.
"""
import json
import os
import sys
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary(name: str, binproviders: str):
"""Output Binary JSONL record for a dependency."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders,
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
git_enabled = get_env_bool('GIT_ENABLED', True)
if not git_enabled:
sys.exit(0)
output_binary(name='git', binproviders='apt,brew,env')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -1,80 +0,0 @@
#!/usr/bin/env python3
"""
Detect git binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if git is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
git_enabled = get_env_bool('GIT_ENABLED', True)
git_binary = get_env('GIT_BINARY', 'git')
if not git_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=git_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='git')
else:
# Binary not found
output_binary_missing(name='git', binproviders='apt,brew')
except Exception:
# Binary not found
output_binary_missing(name='git', binproviders='apt,brew')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -2,7 +2,7 @@
"""
Clone a git repository from a URL.
Usage: on_Snapshot__git.py --url=<url> --snapshot-id=<uuid>
Usage: on_Snapshot__05_git.bg.py --url=<url> --snapshot-id=<uuid>
Output: Clones repository to $PWD/repo
Environment variables:

View File

@@ -1 +1 @@
📂
<span class="abx-output-icon abx-output-icon--git" title="Git"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="6" cy="6" r="2"/><circle cx="6" cy="18" r="2"/><circle cx="18" cy="12" r="2"/><path d="M8 6h5a3 3 0 0 1 3 3v1"/><path d="M8 18h5a3 3 0 0 0 3-3v-1"/></svg></span>

View File

@@ -1 +1 @@
📋
<span class="abx-output-icon abx-output-icon--headers" title="Headers"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="4" cy="7" r="1" fill="currentColor" stroke="none"/><circle cx="4" cy="12" r="1" fill="currentColor" stroke="none"/><circle cx="4" cy="17" r="1" fill="currentColor" stroke="none"/><path d="M7 7h13"/><path d="M7 12h13"/><path d="M7 17h13"/></svg></span>

View File

@@ -76,22 +76,28 @@ def find_html_source() -> str | None:
# Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
search_patterns = [
'singlefile/singlefile.html',
'*_singlefile/singlefile.html',
'singlefile/*.html',
'*_singlefile/*.html',
'dom/output.html',
'*_dom/output.html',
'dom/*.html',
'*_dom/*.html',
'wget/**/*.html',
'*_wget/**/*.html',
'wget/**/*.htm',
'*_wget/**/*.htm',
]
cwd = Path.cwd()
for pattern in search_patterns:
matches = list(cwd.glob(pattern))
for match in matches:
if match.is_file() and match.stat().st_size > 0:
try:
return match.read_text(errors='ignore')
except Exception:
continue
for base in (Path.cwd(), Path.cwd().parent):
for pattern in search_patterns:
matches = list(base.glob(pattern))
for match in matches:
if match.is_file() and match.stat().st_size > 0:
try:
return match.read_text(errors='ignore')
except Exception:
continue
return None

View File

@@ -1 +1 @@
📃
<span class="abx-output-icon abx-output-icon--htmltotext" title="HTML to Text"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M4 7h16"/><path d="M4 12h12"/><path d="M4 17h14"/></svg></span>

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--infiniscroll" title="Infinite Scroll"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M12 5v9"/><path d="M8 10l4 4 4-4"/><circle cx="6" cy="19" r="1" fill="currentColor" stroke="none"/><circle cx="12" cy="19" r="1" fill="currentColor" stroke="none"/><circle cx="18" cy="19" r="1" fill="currentColor" stroke="none"/></svg></span>

View File

@@ -7,7 +7,7 @@
*
* Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm
*
* Priority: 02 (early) - Must install before Chrome session starts at Crawl level
* Priority: 81 - Must install before Chrome session starts at Crawl level
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* This extension automatically:

View File

@@ -1 +0,0 @@
{"type": "Binary", "name": "postlight-parser", "binproviders": "npm,env", "overrides": {"npm": {"packages": ["@postlight/parser"]}}}

View File

@@ -1,85 +0,0 @@
#!/usr/bin/env python3
"""
Detect postlight-parser binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if postlight-parser is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'overrides': {
'npm': {
'packages': ['@postlight/parser'],
}
},
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
mercury_enabled = get_env_bool('MERCURY_ENABLED', True)
mercury_binary = get_env('MERCURY_BINARY', 'postlight-parser')
if not mercury_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=mercury_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='postlight-parser')
else:
# Binary not found
output_binary_missing(name='postlight-parser', binproviders='npm')
except Exception:
# Binary not found
output_binary_missing(name='postlight-parser', binproviders='npm')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,53 @@
#!/usr/bin/env python3
"""
Emit postlight-parser Binary dependency for the crawl.
"""
import json
import os
import sys
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary(name: str, binproviders: str):
"""Output Binary JSONL record for a dependency."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders,
'overrides': {
'npm': {
'packages': ['@postlight/parser'],
}
},
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
mercury_enabled = get_env_bool('MERCURY_ENABLED', True)
if not mercury_enabled:
sys.exit(0)
output_binary(name='postlight-parser', binproviders='npm,env')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -1 +1 @@
☿️
<span class="abx-output-icon abx-output-icon--mercury" title="Mercury"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="5" width="18" height="14" rx="2"/><path d="M7 9h6"/><path d="M7 13h10"/><path d="M15 9h3"/></svg></span>

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--merkletree" title="Merkle Tree"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="5" r="2"/><circle cx="6" cy="18" r="2"/><circle cx="18" cy="18" r="2"/><path d="M12 7v6"/><path d="M12 13l-4 3"/><path d="M12 13l4 3"/></svg></span>

View File

@@ -1 +0,0 @@
"""Tests for the merkletree plugin."""

View File

@@ -287,7 +287,7 @@ async function main() {
page = pages[pages.length - 1];
}
console.error(`Modalcloser listening on ${url}`);
// console.error(`Modalcloser listening on ${url}`);
// Set up dialog handler (for JS alert/confirm/prompt/beforeunload)
page.on('dialog', async (dialog) => {

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--modalcloser" title="Modal Closer"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="4" y="4" width="16" height="16" rx="3"/><path d="M9 9l6 6"/><path d="M15 9l-6 6"/></svg></span>

View File

@@ -90,30 +90,34 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c
}
print(json.dumps(record))
# Emit PATH update if npm bin dir not already in PATH
npm_bin_dir = str(npm_prefix / 'bin')
# Emit PATH update for npm bin dirs (node_modules/.bin preferred)
npm_bin_dirs = [
str(npm_prefix / 'node_modules' / '.bin'),
str(npm_prefix / 'bin'),
]
current_path = os.environ.get('PATH', '')
path_dirs = current_path.split(':') if current_path else []
new_path = current_path
# Check if npm_bin_dir is already in PATH
path_dirs = current_path.split(':')
if npm_bin_dir not in path_dirs:
# Prepend npm_bin_dir to PATH
new_path = f"{npm_bin_dir}:{current_path}" if current_path else npm_bin_dir
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/PATH',
'value': new_path,
}))
click.echo(f" Added {npm_bin_dir} to PATH", err=True)
for npm_bin_dir in npm_bin_dirs:
if npm_bin_dir and npm_bin_dir not in path_dirs:
new_path = f"{npm_bin_dir}:{new_path}" if new_path else npm_bin_dir
path_dirs.insert(0, npm_bin_dir)
print(json.dumps({
'type': 'Machine',
'config': {
'PATH': new_path,
},
}))
# Also emit NODE_MODULES_DIR for JS module resolution
node_modules_dir = str(npm_prefix / 'node_modules')
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/NODE_MODULES_DIR',
'value': node_modules_dir,
'config': {
'NODE_MODULES_DIR': node_modules_dir,
},
}))
# Log human-readable info to stderr
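With this change the hook emits plain Machine records whose config dict is merged, instead of the old per-key `_method: update` records. Illustrative output (the npm prefix path is an assumption, shown as a placeholder):

{"type": "Machine", "config": {"PATH": "<npm_prefix>/node_modules/.bin:<npm_prefix>/bin:<previous PATH>"}}
{"type": "Machine", "config": {"NODE_MODULES_DIR": "<npm_prefix>/node_modules"}}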

View File

@@ -0,0 +1,51 @@
#!/usr/bin/env python3
"""
Emit node/npm Binary dependencies for the crawl.
This hook runs early in the Crawl lifecycle so node/npm are installed
before any npm-based extractors (e.g., puppeteer) run.
"""
import json
import os
import sys
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def output_binary(name: str, binproviders: str, overrides: dict | None = None) -> None:
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders,
'machine_id': machine_id,
}
if overrides:
record['overrides'] = overrides
print(json.dumps(record))
def main() -> None:
output_binary(
name='node',
binproviders='apt,brew,env',
overrides={'apt': {'packages': ['nodejs']}},
)
output_binary(
name='npm',
binproviders='apt,brew,env',
overrides={
'apt': {'packages': ['nodejs', 'npm']},
'brew': {'packages': ['node']},
},
)
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -1 +0,0 @@
"""Tests for the npm binary provider plugin."""

View File

@@ -22,7 +22,7 @@ from django.test import TestCase
# Get the path to the npm provider hook
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_npm_provider.py'
INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_npm_install.py'), None)
def npm_available() -> bool:
@@ -45,7 +45,7 @@ class TestNpmProviderHook(TestCase):
def test_hook_script_exists(self):
"""Hook script should exist."""
self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
def test_hook_requires_lib_dir(self):
"""Hook should fail when LIB_DIR is not set."""

View File

@@ -1 +0,0 @@
{"type": "Binary", "name": "papers-dl", "binproviders": "pip,env"}

View File

@@ -1,80 +0,0 @@
#!/usr/bin/env python3
"""
Detect papers-dl binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if papers-dl is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
papersdl_enabled = get_env_bool('PAPERSDL_ENABLED', True)
papersdl_binary = get_env('PAPERSDL_BINARY', 'papers-dl')
if not papersdl_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=papersdl_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='papers-dl')
else:
# Binary not found
output_binary_missing(name='papers-dl', binproviders='pip')
except Exception:
# Binary not found
output_binary_missing(name='papers-dl', binproviders='pip')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,48 @@
#!/usr/bin/env python3
"""
Emit papers-dl Binary dependency for the crawl.
"""
import json
import os
import sys
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary(name: str, binproviders: str):
"""Output Binary JSONL record for a dependency."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders,
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
papersdl_enabled = get_env_bool('PAPERSDL_ENABLED', True)
if not papersdl_enabled:
sys.exit(0)
output_binary(name='papers-dl', binproviders='pip,env')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -23,6 +23,7 @@ import os
import re
import subprocess
import sys
import threading
from pathlib import Path
import rich_click as click
@@ -108,7 +109,35 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
cmd.extend(papersdl_args_extra)
try:
result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
print(f'[papersdl] Starting download (timeout={timeout}s)', file=sys.stderr)
output_lines: list[str] = []
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
)
def _read_output() -> None:
if not process.stdout:
return
for line in process.stdout:
output_lines.append(line)
sys.stderr.write(line)
reader = threading.Thread(target=_read_output, daemon=True)
reader.start()
try:
process.wait(timeout=timeout)
except subprocess.TimeoutExpired:
process.kill()
reader.join(timeout=1)
return False, None, f'Timed out after {timeout} seconds'
reader.join(timeout=1)
combined_output = ''.join(output_lines)
# Check if any PDF files were downloaded
pdf_files = list(output_dir.glob('*.pdf'))
@@ -117,8 +146,8 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
# Return first PDF file
return True, str(pdf_files[0]), ''
else:
stderr = result.stderr
stdout = result.stdout
stderr = combined_output
stdout = combined_output
# These are NOT errors - page simply has no downloadable paper
stderr_lower = stderr.lower()
@@ -127,7 +156,7 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
return True, None, '' # Paper not available - success, no output
if 'no results' in stderr_lower or 'no results' in stdout_lower:
return True, None, '' # No paper found - success, no output
if result.returncode == 0:
if process.returncode == 0:
return True, None, '' # papers-dl exited cleanly, just no paper - success
# These ARE errors - something went wrong

View File

@@ -1 +1 @@
📄
<span class="abx-output-icon abx-output-icon--papersdl" title="Papers"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M14 3H6a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V9z"/><path d="M14 3v6h6"/><path d="M12 12v5"/><path d="M9.5 14.5L12 17l2.5-2.5"/></svg></span>

View File

@@ -193,6 +193,9 @@ async function extractOutlinks(url) {
type: 'Snapshot',
url: href,
plugin: PLUGIN_NAME,
depth: depth + 1,
parent_snapshot_id: snapshotId || undefined,
crawl_id: crawlId || undefined,
})).join('\n');
if (urlsJsonl) {
@@ -214,6 +217,8 @@ async function main() {
const args = parseArgs();
const url = args.url;
const snapshotId = args.snapshot_id;
const crawlId = args.crawl_id || process.env.CRAWL_ID;
const depth = parseInt(args.depth || process.env.SNAPSHOT_DEPTH || '0', 10) || 0;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__75_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>');
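With the added fields, each discovered outlink is emitted as a full Snapshot record that the crawl system can queue at the right depth. An example line (illustrative URL and UUIDs):

{"type": "Snapshot", "url": "https://example.com/next-page", "plugin": "parse_dom_outlinks", "depth": 1, "parent_snapshot_id": "<uuid>", "crawl_id": "<uuid>"}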

View File

@@ -1 +1 @@
🔗
<span class="abx-output-icon abx-output-icon--parse_dom_outlinks" title="Outlinks"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M10 13a4 4 0 0 1 0-6l2-2a4 4 0 0 1 6 6l-1 1"/><path d="M14 11a4 4 0 0 1 0 6l-2 2a4 4 0 0 1-6-6l1-1"/></svg></span>

View File

@@ -1 +0,0 @@
"""Tests for the parse_dom_outlinks plugin."""

View File

@@ -79,8 +79,7 @@ class TestParseDomOutlinksWithChrome(TestCase):
# Run outlinks hook with the active Chrome session
result = subprocess.run(
['node', str(OUTLINKS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir,
env=get_test_env()),
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,

View File

@@ -24,14 +24,15 @@ from datetime import datetime, timezone
from html import unescape
from html.parser import HTMLParser
from pathlib import Path
from urllib.parse import urljoin, urlparse
from urllib.parse import urljoin, urlparse, urlunparse
import rich_click as click
PLUGIN_NAME = 'parse_html_urls'
# Check if parse_dom_outlinks extractor already ran
DOM_OUTLINKS_URLS_FILE = Path('parse_dom_outlinks/urls.jsonl')
# Check if parse_dom_outlinks extractor already ran (sibling plugin output dir)
DOM_OUTLINKS_URLS_FILE = Path('..') / 'parse_dom_outlinks' / 'urls.jsonl'
URLS_FILE = Path('urls.jsonl')
# URL regex from archivebox/misc/util.py
@@ -95,8 +96,9 @@ def fix_urljoin_bug(url: str, nesting_limit=5) -> str:
def normalize_url(url: str, root_url: str = None) -> str:
"""Normalize a URL, resolving relative paths if root_url provided."""
url = clean_url_candidate(url)
if not root_url:
return url
return _normalize_trailing_slash(url)
url_is_absolute = url.lower().startswith('http://') or url.lower().startswith('https://')
@@ -110,7 +112,40 @@ def normalize_url(url: str, root_url: str = None) -> str:
if did_urljoin_misbehave(root_url, url, resolved):
resolved = fix_urljoin_bug(resolved)
return resolved
return _normalize_trailing_slash(resolved)
def _normalize_trailing_slash(url: str) -> str:
"""Drop trailing slash for non-root paths when no query/fragment."""
try:
parsed = urlparse(url)
path = parsed.path or ''
if path != '/' and path.endswith('/') and not parsed.query and not parsed.fragment:
path = path.rstrip('/')
return urlunparse((parsed.scheme, parsed.netloc, path, parsed.params, parsed.query, parsed.fragment))
except Exception:
pass
return url
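# illustrative behavior (ours, not in the source):
#   'https://example.com/page/'      -> 'https://example.com/page'
#   'https://example.com/'           -> unchanged (root path)
#   'https://example.com/page/?q=1'  -> unchanged (query present)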
def clean_url_candidate(url: str) -> str:
"""Strip obvious surrounding/trailing punctuation from extracted URLs."""
cleaned = (url or '').strip()
if not cleaned:
return cleaned
# Strip common wrappers
cleaned = cleaned.strip(' \t\r\n')
cleaned = cleaned.strip('"\'<>[]()')
# Strip trailing punctuation and escape artifacts
cleaned = cleaned.rstrip('.,;:!?)\\\'"')
cleaned = cleaned.rstrip('"')
# Strip leading punctuation artifacts
cleaned = cleaned.lstrip('("\'<')
return cleaned
def fetch_content(url: str) -> str:
@@ -131,6 +166,43 @@ def fetch_content(url: str) -> str:
return response.read().decode('utf-8', errors='replace')
def find_html_sources() -> list[str]:
"""Find HTML content from other extractors in the snapshot directory."""
search_patterns = [
'readability/content.html',
'*_readability/content.html',
'mercury/content.html',
'*_mercury/content.html',
'singlefile/singlefile.html',
'*_singlefile/singlefile.html',
'singlefile/*.html',
'*_singlefile/*.html',
'dom/output.html',
'*_dom/output.html',
'dom/*.html',
'*_dom/*.html',
'wget/**/*.html',
'*_wget/**/*.html',
'wget/**/*.htm',
'*_wget/**/*.htm',
'wget/**/*.htm*',
'*_wget/**/*.htm*',
]
sources: list[str] = []
for base in (Path.cwd(), Path.cwd().parent):
for pattern in search_patterns:
for match in base.glob(pattern):
if not match.is_file() or match.stat().st_size == 0:
continue
try:
sources.append(match.read_text(errors='ignore'))
except Exception:
continue
return sources
@click.command()
@click.option('--url', required=True, help='HTML URL to parse')
@click.option('--snapshot-id', required=False, help='Parent Snapshot UUID')
@@ -138,6 +210,13 @@ def fetch_content(url: str) -> str:
@click.option('--depth', type=int, default=0, help='Current depth level')
def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
"""Parse HTML and extract href URLs."""
env_depth = os.environ.get('SNAPSHOT_DEPTH')
if env_depth is not None:
try:
depth = int(env_depth)
except Exception:
pass
crawl_id = crawl_id or os.environ.get('CRAWL_ID')
# Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage)
# If parse_dom_outlinks ran but found nothing, we still try static HTML parsing as fallback
@@ -145,32 +224,38 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
click.echo(f'Skipping parse_html_urls - parse_dom_outlinks already extracted URLs')
sys.exit(0)
try:
content = fetch_content(url)
except Exception as e:
click.echo(f'Failed to fetch {url}: {e}', err=True)
sys.exit(1)
# Parse HTML for hrefs
parser = HrefParser()
try:
parser.feed(content)
except Exception as e:
click.echo(f'Failed to parse HTML: {e}', err=True)
sys.exit(1)
contents = find_html_sources()
if not contents:
try:
contents = [fetch_content(url)]
except Exception as e:
click.echo(f'Failed to fetch {url}: {e}', err=True)
sys.exit(1)
urls_found = set()
for href in parser.urls:
# Normalize URL
normalized = normalize_url(href, root_url=url)
for content in contents:
# Parse HTML for hrefs
parser = HrefParser()
try:
parser.feed(content)
except Exception:
pass
# Only include http/https URLs
if normalized.lower().startswith('http://') or normalized.lower().startswith('https://'):
# Skip the source URL itself
if normalized != url:
urls_found.add(unescape(normalized))
for href in parser.urls:
normalized = normalize_url(href, root_url=url)
if normalized.lower().startswith('http://') or normalized.lower().startswith('https://'):
if normalized != url:
urls_found.add(unescape(normalized))
# Emit Snapshot records to stdout (JSONL)
# Also capture explicit URLs in the HTML text
for match in URL_REGEX.findall(content):
normalized = normalize_url(match, root_url=url)
if normalized.lower().startswith('http://') or normalized.lower().startswith('https://'):
if normalized != url:
urls_found.add(unescape(normalized))
# Emit Snapshot records to stdout (JSONL) and urls.jsonl for crawl system
records = []
for found_url in sorted(urls_found):
record = {
'type': 'Snapshot',
@@ -183,8 +268,12 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
if crawl_id:
record['crawl_id'] = crawl_id
records.append(record)
print(json.dumps(record))
if records:
URLS_FILE.write_text('\n'.join(json.dumps(r) for r in records) + '\n')
# Emit ArchiveResult record to mark completion
status = 'succeeded' if urls_found else 'skipped'
output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'

View File

@@ -1 +1 @@
🔗
<span class="abx-output-icon abx-output-icon--parse_html_urls" title="HTML URLs"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M8 9l-3 3 3 3"/><path d="M16 9l3 3-3 3"/><path d="M10 20l4-16"/></svg></span>

View File

@@ -132,6 +132,13 @@ def fetch_content(url: str) -> str:
@click.option('--depth', type=int, default=0, help='Current depth level')
def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
"""Parse JSONL bookmark file and extract URLs."""
env_depth = os.environ.get('SNAPSHOT_DEPTH')
if env_depth is not None:
try:
depth = int(env_depth)
except Exception:
pass
crawl_id = crawl_id or os.environ.get('CRAWL_ID')
try:
content = fetch_content(url)

View File

@@ -1 +1 @@
📋
<span class="abx-output-icon abx-output-icon--parse_jsonl_urls" title="JSONL URLs"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M8 4H5v16h3"/><path d="M16 4h3v16h-3"/><circle cx="12" cy="8" r="1" fill="currentColor" stroke="none"/><circle cx="12" cy="12" r="1" fill="currentColor" stroke="none"/><circle cx="12" cy="16" r="1" fill="currentColor" stroke="none"/></svg></span>

View File

@@ -168,6 +168,13 @@ def fetch_content(url: str) -> str:
@click.option('--depth', type=int, default=0, help='Current depth level')
def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
"""Parse Netscape bookmark HTML and extract URLs."""
env_depth = os.environ.get('SNAPSHOT_DEPTH')
if env_depth is not None:
try:
depth = int(env_depth)
except Exception:
pass
crawl_id = crawl_id or os.environ.get('CRAWL_ID')
try:
content = fetch_content(url)

View File

@@ -1 +1 @@
🔖
<span class="abx-output-icon abx-output-icon--parse_netscape_urls" title="Netscape Bookmarks"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M6 4h12v16l-6-4-6 4z"/></svg></span>

View File

@@ -56,6 +56,13 @@ def fetch_content(url: str) -> str:
@click.option('--depth', type=int, default=0, help='Current depth level')
def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
"""Parse RSS/Atom feed and extract article URLs."""
env_depth = os.environ.get('SNAPSHOT_DEPTH')
if env_depth is not None:
try:
depth = int(env_depth)
except Exception:
pass
crawl_id = crawl_id or os.environ.get('CRAWL_ID')
if feedparser is None:
click.echo('feedparser library not installed', err=True)

View File

@@ -1 +1 @@
📡
<span class="abx-output-icon abx-output-icon--parse_rss_urls" title="RSS"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="5" cy="19" r="1.5" fill="currentColor" stroke="none"/><path d="M5 11a8 8 0 0 1 8 8"/><path d="M5 5a14 14 0 0 1 14 14"/></svg></span>

View File

@@ -105,6 +105,13 @@ def fetch_content(url: str) -> str:
@click.option('--depth', type=int, default=0, help='Current depth level')
def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
"""Parse plain text and extract URLs."""
env_depth = os.environ.get('SNAPSHOT_DEPTH')
if env_depth is not None:
try:
depth = int(env_depth)
except Exception:
pass
crawl_id = crawl_id or os.environ.get('CRAWL_ID')
try:
content = fetch_content(url)
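This SNAPSHOT_DEPTH override block now appears verbatim in every parser hook (html, jsonl, netscape, rss, text); a shared helper along these lines (name is ours) would capture it:

import os

def effective_depth(cli_depth: int) -> int:
    """Prefer SNAPSHOT_DEPTH from the environment over the --depth CLI flag."""
    try:
        return int(os.environ.get('SNAPSHOT_DEPTH', cli_depth))
    except ValueError:
        return cli_depth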

Some files were not shown because too many files have changed in this diff.