mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
wip 2
This commit is contained in:
@@ -36,8 +36,9 @@ os.environ['TZ'] = 'UTC'
|
||||
from .config.permissions import drop_privileges # noqa
|
||||
drop_privileges()
|
||||
|
||||
from .misc.checks import check_not_root, check_io_encoding # noqa
|
||||
from .misc.checks import check_not_root, check_not_inside_source_dir, check_io_encoding # noqa
|
||||
check_not_root()
|
||||
check_not_inside_source_dir()
|
||||
check_io_encoding()
|
||||
|
||||
# Install monkey patches for third-party libraries
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
# Generated by Django 5.0.6 on 2024-12-25 (squashed)
|
||||
# Squashed migration: replaces 0001-0009
|
||||
# For fresh installs: creates final schema
|
||||
# For dev users with 0001-0009 applied: marked as applied (no-op)
|
||||
|
||||
from uuid import uuid4
|
||||
from django.conf import settings
|
||||
@@ -12,6 +14,18 @@ class Migration(migrations.Migration):
|
||||
|
||||
initial = True
|
||||
|
||||
replaces = [
|
||||
('api', '0001_initial'),
|
||||
('api', '0002_alter_apitoken_options'),
|
||||
('api', '0003_rename_user_apitoken_created_by_apitoken_abid_and_more'),
|
||||
('api', '0004_alter_apitoken_id_alter_apitoken_uuid'),
|
||||
('api', '0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more'),
|
||||
('api', '0006_remove_outboundwebhook_uuid_apitoken_id_and_more'),
|
||||
('api', '0007_alter_apitoken_created_by'),
|
||||
('api', '0008_alter_apitoken_created_alter_apitoken_created_by_and_more'),
|
||||
('api', '0009_rename_created_apitoken_created_at_and_more'),
|
||||
]
|
||||
|
||||
dependencies = [
|
||||
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||
]
|
||||
@@ -25,9 +25,14 @@ from archivebox.misc.hashing import get_dir_info
|
||||
|
||||
def get_or_create_system_user_pk(username='system'):
|
||||
User = get_user_model()
|
||||
# If there's exactly one superuser, use that for all system operations
|
||||
if User.objects.filter(is_superuser=True).count() == 1:
|
||||
return User.objects.filter(is_superuser=True).values_list('pk', flat=True)[0]
|
||||
user, _ = User.objects.get_or_create(username=username, is_staff=True, is_superuser=True, defaults={'email': '', 'password': ''})
|
||||
# Otherwise get or create the system user
|
||||
user, _ = User.objects.get_or_create(
|
||||
username=username,
|
||||
defaults={'is_staff': True, 'is_superuser': True, 'email': '', 'password': '!'}
|
||||
)
|
||||
return user.pk
|
||||
|
||||
|
||||
|
||||
@@ -38,21 +38,18 @@ def remove(filter_patterns: Iterable[str]=(),
|
||||
setup_django()
|
||||
check_data_folder()
|
||||
|
||||
from archivebox.cli.archivebox_search import list_links
|
||||
|
||||
list_kwargs = {
|
||||
"filter_patterns": filter_patterns,
|
||||
"filter_type": filter_type,
|
||||
"after": after,
|
||||
"before": before,
|
||||
}
|
||||
if snapshots:
|
||||
list_kwargs["snapshots"] = snapshots
|
||||
from archivebox.cli.archivebox_search import get_snapshots
|
||||
|
||||
log_list_started(filter_patterns, filter_type)
|
||||
timer = TimedProgress(360, prefix=' ')
|
||||
try:
|
||||
snapshots = list_links(**list_kwargs)
|
||||
snapshots = get_snapshots(
|
||||
snapshots=snapshots,
|
||||
filter_patterns=list(filter_patterns) if filter_patterns else None,
|
||||
filter_type=filter_type,
|
||||
after=after,
|
||||
before=before,
|
||||
)
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER
|
||||
#############################################################################################
|
||||
|
||||
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
|
||||
DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir
|
||||
DATA_DIR: Path = Path(os.environ.get('DATA_DIR', os.getcwd())).resolve() # archivebox user data dir
|
||||
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
|
||||
|
||||
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
|
||||
@@ -13,7 +13,7 @@ from typing import Optional
|
||||
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
|
||||
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
|
||||
DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir
|
||||
DATA_DIR: Path = Path(os.environ.get('DATA_DIR', os.getcwd())).resolve() # archivebox user data dir
|
||||
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
|
||||
|
||||
#############################################################################################
|
||||
|
||||
@@ -6,8 +6,24 @@ from pathlib import Path
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
from config import CONFIG
|
||||
from index.json import to_json
|
||||
# Handle old vs new import paths
|
||||
try:
|
||||
from archivebox.config import CONSTANTS
|
||||
ARCHIVE_DIR = CONSTANTS.ARCHIVE_DIR
|
||||
except ImportError:
|
||||
try:
|
||||
from config import CONFIG
|
||||
ARCHIVE_DIR = Path(CONFIG.get('ARCHIVE_DIR', './archive'))
|
||||
except ImportError:
|
||||
ARCHIVE_DIR = Path('./archive')
|
||||
|
||||
try:
|
||||
from archivebox.misc.util import to_json
|
||||
except ImportError:
|
||||
try:
|
||||
from index.json import to_json
|
||||
except ImportError:
|
||||
to_json = lambda x: json.dumps(x, indent=4, default=str)
|
||||
|
||||
try:
|
||||
JSONField = models.JSONField
|
||||
@@ -17,14 +33,12 @@ except AttributeError:
|
||||
|
||||
|
||||
def forwards_func(apps, schema_editor):
|
||||
from core.models import EXTRACTORS
|
||||
|
||||
Snapshot = apps.get_model("core", "Snapshot")
|
||||
ArchiveResult = apps.get_model("core", "ArchiveResult")
|
||||
|
||||
snapshots = Snapshot.objects.all()
|
||||
for snapshot in snapshots:
|
||||
out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
|
||||
out_dir = Path(ARCHIVE_DIR) / snapshot.timestamp
|
||||
|
||||
try:
|
||||
with open(out_dir / "index.json", "r") as f:
|
||||
@@ -59,7 +73,7 @@ def forwards_func(apps, schema_editor):
|
||||
|
||||
def verify_json_index_integrity(snapshot):
|
||||
results = snapshot.archiveresult_set.all()
|
||||
out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
|
||||
out_dir = Path(ARCHIVE_DIR) / snapshot.timestamp
|
||||
with open(out_dir / "index.json", "r") as f:
|
||||
index = json.load(f)
|
||||
|
||||
|
||||
@@ -169,6 +169,18 @@ class Migration(migrations.Migration):
|
||||
operations = [
|
||||
# === SNAPSHOT CHANGES ===
|
||||
|
||||
# Add health stats fields to Snapshot
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='num_uses_failed',
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='num_uses_succeeded',
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
|
||||
# Add new fields to Snapshot
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
@@ -266,17 +278,28 @@ class Migration(migrations.Migration):
|
||||
migrations.RemoveField(model_name='snapshot', name='added'),
|
||||
migrations.RemoveField(model_name='snapshot', name='updated'),
|
||||
|
||||
# Remove old 'tags' CharField (now M2M via Tag model)
|
||||
migrations.RemoveField(model_name='snapshot', name='tags'),
|
||||
# Register SnapshotTag through model (table already exists from 0006's ManyToManyField)
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
migrations.CreateModel(
|
||||
name='SnapshotTag',
|
||||
fields=[
|
||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('snapshot', models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')),
|
||||
('tag', models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag')),
|
||||
],
|
||||
options={
|
||||
'db_table': 'core_snapshot_tags',
|
||||
},
|
||||
),
|
||||
],
|
||||
database_operations=[], # Table already exists from 0006
|
||||
),
|
||||
|
||||
# === TAG CHANGES ===
|
||||
# Tag keeps AutoField (integer) id for migration compatibility
|
||||
|
||||
# Add uuid field to Tag temporarily for ID migration
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='uuid',
|
||||
field=models.UUIDField(default=uuid4, null=True, blank=True),
|
||||
),
|
||||
# Add tracking fields to Tag
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='created_by',
|
||||
@@ -298,21 +321,9 @@ class Migration(migrations.Migration):
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
|
||||
# Populate UUIDs for tags
|
||||
migrations.RunPython(generate_uuid_for_tags, migrations.RunPython.noop),
|
||||
# Populate created_by for tags
|
||||
migrations.RunPython(populate_created_by_tag, migrations.RunPython.noop),
|
||||
|
||||
# Make created_by non-nullable
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
related_name='tag_set',
|
||||
to=settings.AUTH_USER_MODEL,
|
||||
),
|
||||
),
|
||||
|
||||
# Update slug field
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
@@ -322,6 +333,18 @@ class Migration(migrations.Migration):
|
||||
|
||||
# === ARCHIVERESULT CHANGES ===
|
||||
|
||||
# Add health stats fields to ArchiveResult
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='num_uses_failed',
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='num_uses_succeeded',
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
|
||||
# Add uuid field for new ID
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
@@ -363,6 +386,11 @@ class Migration(migrations.Migration):
|
||||
name='output_dir',
|
||||
field=models.CharField(max_length=256, default=None, null=True, blank=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='config',
|
||||
field=models.JSONField(default=dict, blank=False),
|
||||
),
|
||||
|
||||
# Populate UUIDs and data for archive results
|
||||
migrations.RunPython(generate_uuid_for_archiveresults, migrations.RunPython.noop),
|
||||
|
||||
40
archivebox/core/migrations/0024_snapshot_crawl.py
Normal file
40
archivebox/core/migrations/0024_snapshot_crawl.py
Normal file
@@ -0,0 +1,40 @@
|
||||
# Generated by Django 5.0.6 on 2024-12-25
|
||||
# Adds crawl FK and iface FK after crawls and machine apps are created
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0023_new_schema'),
|
||||
('crawls', '0001_initial'),
|
||||
('machine', '0001_initial'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
# Add crawl FK to Snapshot
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='crawl',
|
||||
field=models.ForeignKey(
|
||||
default=None, null=True, blank=True,
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
related_name='snapshot_set',
|
||||
to='crawls.crawl',
|
||||
db_index=True,
|
||||
),
|
||||
),
|
||||
|
||||
# Add network interface FK to ArchiveResult
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='iface',
|
||||
field=models.ForeignKey(
|
||||
null=True, blank=True,
|
||||
on_delete=django.db.models.deletion.SET_NULL,
|
||||
to='machine.networkinterface',
|
||||
),
|
||||
),
|
||||
]
|
||||
@@ -37,9 +37,11 @@ from machine.models import NetworkInterface
|
||||
|
||||
|
||||
class Tag(ModelWithSerializers):
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
# Keep AutoField for compatibility with main branch migrations
|
||||
# Don't use UUIDField here - requires complex FK transformation
|
||||
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
|
||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='tag_set')
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True, null=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
name = models.CharField(unique=True, blank=False, max_length=100)
|
||||
slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
|
||||
@@ -81,16 +83,8 @@ class SnapshotTag(models.Model):
|
||||
unique_together = [('snapshot', 'tag')]
|
||||
|
||||
|
||||
class SnapshotManager(models.Manager):
|
||||
def filter(self, *args, **kwargs):
|
||||
domain = kwargs.pop('domain', None)
|
||||
qs = super().filter(*args, **kwargs)
|
||||
if domain:
|
||||
qs = qs.filter(url__icontains=f'://{domain}')
|
||||
return qs
|
||||
|
||||
def get_queryset(self):
|
||||
return super().get_queryset().prefetch_related('tags', 'archiveresult_set')
|
||||
class SnapshotQuerySet(models.QuerySet):
|
||||
"""Custom QuerySet for Snapshot model with export methods that persist through .filter() etc."""
|
||||
|
||||
# =========================================================================
|
||||
# Filtering Methods
|
||||
@@ -105,7 +99,7 @@ class SnapshotManager(models.Manager):
|
||||
'timestamp': lambda pattern: models.Q(timestamp=pattern),
|
||||
}
|
||||
|
||||
def filter_by_patterns(self, patterns: List[str], filter_type: str = 'exact') -> QuerySet:
|
||||
def filter_by_patterns(self, patterns: List[str], filter_type: str = 'exact') -> 'SnapshotQuerySet':
|
||||
"""Filter snapshots by URL patterns using specified filter type"""
|
||||
from archivebox.misc.logging import stderr
|
||||
|
||||
@@ -120,7 +114,7 @@ class SnapshotManager(models.Manager):
|
||||
raise SystemExit(2)
|
||||
return self.filter(q_filter)
|
||||
|
||||
def search(self, patterns: List[str]) -> QuerySet:
|
||||
def search(self, patterns: List[str]) -> 'SnapshotQuerySet':
|
||||
"""Search snapshots using the configured search backend"""
|
||||
from archivebox.config.common import SEARCH_BACKEND_CONFIG
|
||||
from archivebox.search import query_search_index
|
||||
@@ -208,6 +202,20 @@ class SnapshotManager(models.Manager):
|
||||
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
|
||||
})
|
||||
|
||||
|
||||
class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
|
||||
"""Manager for Snapshot model - uses SnapshotQuerySet for chainable methods"""
|
||||
|
||||
def filter(self, *args, **kwargs):
|
||||
domain = kwargs.pop('domain', None)
|
||||
qs = super().filter(*args, **kwargs)
|
||||
if domain:
|
||||
qs = qs.filter(url__icontains=f'://{domain}')
|
||||
return qs
|
||||
|
||||
def get_queryset(self):
|
||||
return super().get_queryset().prefetch_related('tags', 'archiveresult_set')
|
||||
|
||||
# =========================================================================
|
||||
# Import Methods
|
||||
# =========================================================================
|
||||
@@ -766,7 +774,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
('dom', 'dom'), ('title', 'title'), ('wget', 'wget'),
|
||||
)
|
||||
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
# Keep AutoField for backward compatibility with 0.7.x databases
|
||||
# UUID field is added separately by migration for new records
|
||||
id = models.AutoField(primary_key=True, editable=False)
|
||||
uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True, unique=True)
|
||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='archiveresult_set', db_index=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
@@ -851,14 +862,22 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
updates status/output fields, queues discovered URLs, and triggers indexing.
|
||||
"""
|
||||
from django.utils import timezone
|
||||
from archivebox.hooks import discover_hooks, run_hook
|
||||
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
|
||||
|
||||
extractor_dir = Path(self.snapshot.output_dir) / self.extractor
|
||||
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
|
||||
|
||||
# Discover hook for this extractor
|
||||
hooks = discover_hooks(f'Snapshot__{self.extractor}')
|
||||
if not hooks:
|
||||
# Find hook for this extractor
|
||||
hook = None
|
||||
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||
if not base_dir.exists():
|
||||
continue
|
||||
matches = list(base_dir.glob(f'*/on_Snapshot__{self.extractor}.*'))
|
||||
if matches:
|
||||
hook = matches[0]
|
||||
break
|
||||
|
||||
if not hook:
|
||||
self.status = self.StatusChoices.FAILED
|
||||
self.output = f'No hook found for: {self.extractor}'
|
||||
self.retry_at = None
|
||||
@@ -868,7 +887,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
# Run the hook
|
||||
start_ts = timezone.now()
|
||||
result = run_hook(
|
||||
hooks[0],
|
||||
hook,
|
||||
output_dir=extractor_dir,
|
||||
config_objects=config_objects,
|
||||
url=self.snapshot.url,
|
||||
|
||||
@@ -5,6 +5,7 @@ import os
|
||||
from datetime import timedelta
|
||||
from typing import ClassVar
|
||||
|
||||
from django.db.models import F
|
||||
from django.utils import timezone
|
||||
|
||||
from rich import print
|
||||
@@ -14,6 +15,7 @@ from statemachine import State, StateMachine
|
||||
# from workers.actor import ActorType
|
||||
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
from crawls.models import Crawl, Seed
|
||||
|
||||
|
||||
class SnapshotMachine(StateMachine, strict_states=True):
|
||||
@@ -254,6 +256,18 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
)
|
||||
self.archiveresult.save(write_indexes=True)
|
||||
|
||||
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed
|
||||
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
||||
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
||||
|
||||
# Also update Crawl and Seed health stats if snapshot has a crawl
|
||||
snapshot = self.archiveresult.snapshot
|
||||
if snapshot.crawl_id:
|
||||
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
||||
crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first()
|
||||
if crawl:
|
||||
Seed.objects.filter(pk=crawl).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
||||
|
||||
@failed.enter
|
||||
def enter_failed(self):
|
||||
print(f'{self}.on_failed() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
||||
@@ -263,6 +277,18 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
end_ts=timezone.now(),
|
||||
)
|
||||
|
||||
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed
|
||||
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_failed=F('num_uses_failed') + 1)
|
||||
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_failed=F('num_uses_failed') + 1)
|
||||
|
||||
# Also update Crawl and Seed health stats if snapshot has a crawl
|
||||
snapshot = self.archiveresult.snapshot
|
||||
if snapshot.crawl_id:
|
||||
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_failed=F('num_uses_failed') + 1)
|
||||
crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first()
|
||||
if crawl:
|
||||
Seed.objects.filter(pk=crawl).update(num_uses_failed=F('num_uses_failed') + 1)
|
||||
|
||||
@skipped.enter
|
||||
def enter_skipped(self):
|
||||
print(f'{self}.on_skipped() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
||||
|
||||
@@ -1,14 +1,12 @@
|
||||
# Generated by Django 5.2.9 on 2025-12-24 19:54
|
||||
# Initial migration for crawls app
|
||||
# This is a new app, no previous migrations to replace
|
||||
|
||||
import archivebox.base_models.models
|
||||
import django.core.validators
|
||||
from uuid import uuid4
|
||||
from django.conf import settings
|
||||
from django.core.validators import MinValueValidator, MaxValueValidator
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
import pathlib
|
||||
import statemachine.mixins
|
||||
import uuid
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
@@ -16,50 +14,72 @@ class Migration(migrations.Migration):
|
||||
initial = True
|
||||
|
||||
dependencies = [
|
||||
('core', '0001_initial'),
|
||||
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='Seed',
|
||||
fields=[
|
||||
('num_uses_failed', models.PositiveIntegerField(default=0)),
|
||||
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
|
||||
('id', models.UUIDField(default=uuid4, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('uri', models.URLField(max_length=2048)),
|
||||
('extractor', models.CharField(default='auto', max_length=32)),
|
||||
('tags_str', models.CharField(blank=True, default='', max_length=255)),
|
||||
('label', models.CharField(blank=True, default='', max_length=255)),
|
||||
('config', models.JSONField(default=dict)),
|
||||
('output_dir', models.CharField(blank=True, default='', max_length=512)),
|
||||
('notes', models.TextField(blank=True, default='')),
|
||||
('created_by', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Seed',
|
||||
'verbose_name_plural': 'Seeds',
|
||||
'unique_together': {('created_by', 'label'), ('created_by', 'uri', 'extractor')},
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='Crawl',
|
||||
fields=[
|
||||
('num_uses_failed', models.PositiveIntegerField(default=0)),
|
||||
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
|
||||
('id', models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('id', models.UUIDField(default=uuid4, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('urls', models.TextField(blank=True, default='')),
|
||||
('config', models.JSONField(default=dict)),
|
||||
('max_depth', models.PositiveSmallIntegerField(default=0, validators=[django.core.validators.MinValueValidator(0), django.core.validators.MaxValueValidator(4)])),
|
||||
('max_depth', models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])),
|
||||
('tags_str', models.CharField(blank=True, default='', max_length=1024)),
|
||||
('persona_id', models.UUIDField(blank=True, null=True)),
|
||||
('label', models.CharField(blank=True, default='', max_length=64)),
|
||||
('notes', models.TextField(blank=True, default='')),
|
||||
('output_dir', models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/archive'))),
|
||||
('output_dir', models.CharField(blank=True, default='', max_length=512)),
|
||||
('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15)),
|
||||
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True)),
|
||||
('created_by', models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
|
||||
('created_by', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
|
||||
('seed', models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, related_name='crawl_set', to='crawls.seed')),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Crawl',
|
||||
'verbose_name_plural': 'Crawls',
|
||||
},
|
||||
bases=(models.Model, statemachine.mixins.MachineMixin),
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='CrawlSchedule',
|
||||
fields=[
|
||||
('num_uses_failed', models.PositiveIntegerField(default=0)),
|
||||
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
|
||||
('id', models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('id', models.UUIDField(default=uuid4, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('schedule', models.CharField(max_length=64)),
|
||||
('is_enabled', models.BooleanField(default=True)),
|
||||
('label', models.CharField(blank=True, default='', max_length=64)),
|
||||
('notes', models.TextField(blank=True, default='')),
|
||||
('created_by', models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
|
||||
('created_by', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
|
||||
('template', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawls.crawl')),
|
||||
],
|
||||
options={
|
||||
@@ -72,48 +92,4 @@ class Migration(migrations.Migration):
|
||||
name='schedule',
|
||||
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='crawls.crawlschedule'),
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='Seed',
|
||||
fields=[
|
||||
('num_uses_failed', models.PositiveIntegerField(default=0)),
|
||||
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
|
||||
('id', models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('uri', models.URLField(max_length=2048)),
|
||||
('extractor', models.CharField(default='auto', max_length=32)),
|
||||
('tags_str', models.CharField(blank=True, default='', max_length=255)),
|
||||
('label', models.CharField(blank=True, default='', max_length=255)),
|
||||
('config', models.JSONField(default=dict)),
|
||||
('output_dir', models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/archive'))),
|
||||
('notes', models.TextField(blank=True, default='')),
|
||||
('created_by', models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Seed',
|
||||
'verbose_name_plural': 'Seeds',
|
||||
'unique_together': {('created_by', 'label'), ('created_by', 'uri', 'extractor')},
|
||||
},
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='crawl',
|
||||
name='seed',
|
||||
field=models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, related_name='crawl_set', to='crawls.seed'),
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='Outlink',
|
||||
fields=[
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('id', models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('src', models.URLField()),
|
||||
('dst', models.URLField()),
|
||||
('crawl', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='outlink_set', to='crawls.crawl')),
|
||||
('created_by', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
|
||||
('via', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='outlink_set', to='core.archiveresult')),
|
||||
],
|
||||
options={
|
||||
'unique_together': {('src', 'dst', 'via')},
|
||||
},
|
||||
),
|
||||
]
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
# Generated by Django 6.0 on 2025-12-25 02:19
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('crawls', '0001_initial'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.DeleteModel(
|
||||
name='Outlink',
|
||||
),
|
||||
]
|
||||
@@ -1,140 +0,0 @@
|
||||
# Generated by Django 5.1.1 on 2024-10-02 04:34
|
||||
# Modified: Removed abid/charidfield - ABID system removed
|
||||
|
||||
import archivebox.base_models.models
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
def drop_machine_abid_fields_if_exist(apps, schema_editor):
|
||||
"""Drop abid fields from machine tables if they exist."""
|
||||
connection = schema_editor.connection
|
||||
tables_and_fields = [
|
||||
('machine_machine', 'abid'),
|
||||
('machine_networkinterface', 'abid'),
|
||||
]
|
||||
for table_name, field_name in tables_and_fields:
|
||||
with connection.cursor() as cursor:
|
||||
try:
|
||||
cursor.execute(f"PRAGMA table_info({table_name})")
|
||||
columns = [row[1] for row in cursor.fetchall()]
|
||||
if field_name in columns:
|
||||
print(f" Dropping {table_name}.{field_name}...")
|
||||
cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN {field_name}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
initial = True
|
||||
|
||||
dependencies = []
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name="Machine",
|
||||
fields=[
|
||||
(
|
||||
"id",
|
||||
models.UUIDField(
|
||||
default=None,
|
||||
editable=False,
|
||||
primary_key=True,
|
||||
serialize=False,
|
||||
unique=True,
|
||||
verbose_name="ID",
|
||||
),
|
||||
),
|
||||
# Removed: abid field - ABID system removed
|
||||
(
|
||||
"created_at",
|
||||
archivebox.base_models.models.AutoDateTimeField(
|
||||
db_index=True, default=None
|
||||
),
|
||||
),
|
||||
("modified_at", models.DateTimeField(auto_now=True)),
|
||||
(
|
||||
"guid",
|
||||
models.CharField(
|
||||
default=None, editable=False, max_length=64, unique=True
|
||||
),
|
||||
),
|
||||
("hostname", models.CharField(default=None, max_length=63)),
|
||||
("hw_in_docker", models.BooleanField(default=False)),
|
||||
("hw_in_vm", models.BooleanField(default=False)),
|
||||
("hw_manufacturer", models.CharField(default=None, max_length=63)),
|
||||
("hw_product", models.CharField(default=None, max_length=63)),
|
||||
("hw_uuid", models.CharField(default=None, max_length=255)),
|
||||
("os_arch", models.CharField(default=None, max_length=15)),
|
||||
("os_family", models.CharField(default=None, max_length=15)),
|
||||
("os_platform", models.CharField(default=None, max_length=63)),
|
||||
("os_release", models.CharField(default=None, max_length=63)),
|
||||
("os_kernel", models.CharField(default=None, max_length=255)),
|
||||
("stats", models.JSONField(default=None)),
|
||||
],
|
||||
options={
|
||||
"abstract": False,
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name="NetworkInterface",
|
||||
fields=[
|
||||
(
|
||||
"id",
|
||||
models.UUIDField(
|
||||
default=None,
|
||||
editable=False,
|
||||
primary_key=True,
|
||||
serialize=False,
|
||||
unique=True,
|
||||
verbose_name="ID",
|
||||
),
|
||||
),
|
||||
# Removed: abid field - ABID system removed
|
||||
(
|
||||
"created_at",
|
||||
archivebox.base_models.models.AutoDateTimeField(
|
||||
db_index=True, default=None
|
||||
),
|
||||
),
|
||||
("modified_at", models.DateTimeField(auto_now=True)),
|
||||
(
|
||||
"mac_address",
|
||||
models.CharField(default=None, editable=False, max_length=17),
|
||||
),
|
||||
(
|
||||
"ip_public",
|
||||
models.GenericIPAddressField(default=None, editable=False),
|
||||
),
|
||||
(
|
||||
"ip_local",
|
||||
models.GenericIPAddressField(default=None, editable=False),
|
||||
),
|
||||
(
|
||||
"dns_server",
|
||||
models.GenericIPAddressField(default=None, editable=False),
|
||||
),
|
||||
("iface", models.CharField(default=None, max_length=15)),
|
||||
("hostname", models.CharField(default=None, max_length=63)),
|
||||
("isp", models.CharField(default=None, max_length=63)),
|
||||
("city", models.CharField(default=None, max_length=63)),
|
||||
("region", models.CharField(default=None, max_length=63)),
|
||||
("country", models.CharField(default=None, max_length=63)),
|
||||
(
|
||||
"machine",
|
||||
models.ForeignKey(
|
||||
default=None,
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
to="machine.machine",
|
||||
),
|
||||
),
|
||||
],
|
||||
options={
|
||||
"unique_together": {
|
||||
("machine", "ip_public", "ip_local", "mac_address", "dns_server")
|
||||
},
|
||||
},
|
||||
),
|
||||
migrations.RunPython(drop_machine_abid_fields_if_exist, reverse_code=migrations.RunPython.noop),
|
||||
]
|
||||
111
archivebox/machine/migrations/0001_squashed.py
Normal file
111
archivebox/machine/migrations/0001_squashed.py
Normal file
@@ -0,0 +1,111 @@
|
||||
# Squashed migration: replaces 0001-0004
|
||||
# For fresh installs: creates final schema
|
||||
# For dev users with 0001-0004 applied: marked as applied (no-op)
|
||||
|
||||
from uuid import uuid4
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Squashed schema for the machine app (replaces migrations 0001-0004).

    Fresh installs apply this single migration; dev installs that already
    applied 0001-0004 will see it marked as applied via ``replaces`` (no-op).
    """

    initial = True

    # Original migration chain this squash stands in for.
    replaces = [
        ('machine', '0001_initial'),
        ('machine', '0002_alter_machine_stats_installedbinary'),
        ('machine', '0003_alter_installedbinary_options_and_more'),
        ('machine', '0004_alter_installedbinary_abspath_and_more'),
    ]

    dependencies = []

    operations = [
        # Host machine record: hardware/OS fingerprint plus usage counters.
        migrations.CreateModel(
            name='Machine',
            fields=[
                ('num_uses_failed', models.PositiveIntegerField(default=0)),
                ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
                ('id', models.UUIDField(default=uuid4, editable=False, primary_key=True, serialize=False, unique=True)),
                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
                ('modified_at', models.DateTimeField(auto_now=True)),
                ('guid', models.CharField(default=None, editable=False, max_length=64, unique=True)),
                ('hostname', models.CharField(default=None, max_length=63)),
                ('hw_in_docker', models.BooleanField(default=False)),
                ('hw_in_vm', models.BooleanField(default=False)),
                ('hw_manufacturer', models.CharField(default=None, max_length=63)),
                ('hw_product', models.CharField(default=None, max_length=63)),
                ('hw_uuid', models.CharField(default=None, max_length=255)),
                ('os_arch', models.CharField(default=None, max_length=15)),
                ('os_family', models.CharField(default=None, max_length=15)),
                ('os_platform', models.CharField(default=None, max_length=63)),
                ('os_release', models.CharField(default=None, max_length=63)),
                ('os_kernel', models.CharField(default=None, max_length=255)),
                ('stats', models.JSONField(default=dict)),
                ('config', models.JSONField(blank=True, default=dict)),
            ],
        ),
        # One row per (machine, network identity) combination observed.
        migrations.CreateModel(
            name='NetworkInterface',
            fields=[
                ('num_uses_failed', models.PositiveIntegerField(default=0)),
                ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
                ('id', models.UUIDField(default=uuid4, editable=False, primary_key=True, serialize=False, unique=True)),
                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
                ('modified_at', models.DateTimeField(auto_now=True)),
                ('mac_address', models.CharField(default=None, editable=False, max_length=17)),
                ('ip_public', models.GenericIPAddressField(default=None, editable=False)),
                ('ip_local', models.GenericIPAddressField(default=None, editable=False)),
                ('dns_server', models.GenericIPAddressField(default=None, editable=False)),
                ('hostname', models.CharField(default=None, max_length=63)),
                ('iface', models.CharField(default=None, max_length=15)),
                ('isp', models.CharField(default=None, max_length=63)),
                ('city', models.CharField(default=None, max_length=63)),
                ('region', models.CharField(default=None, max_length=63)),
                ('country', models.CharField(default=None, max_length=63)),
                ('machine', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
            ],
            options={
                # Dedupe: the same network identity on the same machine is one row.
                'unique_together': {('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server')},
            },
        ),
        # Abstract binary requirement (e.g. 'chrome'), provider-agnostic.
        migrations.CreateModel(
            name='Dependency',
            fields=[
                ('id', models.UUIDField(default=uuid4, editable=False, primary_key=True, serialize=False, unique=True)),
                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
                ('modified_at', models.DateTimeField(auto_now=True)),
                ('bin_name', models.CharField(db_index=True, max_length=63, unique=True)),
                ('bin_providers', models.CharField(default='*', max_length=127)),
                ('custom_cmds', models.JSONField(blank=True, default=dict)),
                ('config', models.JSONField(blank=True, default=dict)),
            ],
            options={
                'verbose_name': 'Dependency',
                'verbose_name_plural': 'Dependencies',
            },
        ),
        # Concrete resolved binary on a specific machine (path/version/hash).
        migrations.CreateModel(
            name='InstalledBinary',
            fields=[
                ('num_uses_failed', models.PositiveIntegerField(default=0)),
                ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
                ('id', models.UUIDField(default=uuid4, editable=False, primary_key=True, serialize=False, unique=True)),
                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
                ('modified_at', models.DateTimeField(auto_now=True)),
                ('name', models.CharField(blank=True, db_index=True, default=None, max_length=63)),
                ('binprovider', models.CharField(blank=True, default=None, max_length=31)),
                ('abspath', models.CharField(blank=True, default=None, max_length=255)),
                ('version', models.CharField(blank=True, default=None, max_length=32)),
                ('sha256', models.CharField(blank=True, default=None, max_length=64)),
                ('machine', models.ForeignKey(blank=True, default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
                ('dependency', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='installedbinary_set', to='machine.dependency')),
            ],
            options={
                'verbose_name': 'Installed Binary',
                'verbose_name_plural': 'Installed Binaries',
                'unique_together': {('machine', 'name', 'abspath', 'version', 'sha256')},
            },
        ),
    ]
|
||||
@@ -1,78 +0,0 @@
|
||||
# Generated by Django 5.1.1 on 2024-10-03 07:25
|
||||
# Modified: Removed abid/charidfield - ABID system removed
|
||||
|
||||
import archivebox.base_models.models
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
def drop_installedbinary_abid_if_exist(apps, schema_editor):
|
||||
"""Drop abid field from installedbinary if it exists."""
|
||||
connection = schema_editor.connection
|
||||
with connection.cursor() as cursor:
|
||||
try:
|
||||
cursor.execute("PRAGMA table_info(machine_installedbinary)")
|
||||
columns = [row[1] for row in cursor.fetchall()]
|
||||
if 'abid' in columns:
|
||||
print(" Dropping machine_installedbinary.abid...")
|
||||
cursor.execute("ALTER TABLE machine_installedbinary DROP COLUMN abid")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
("machine", "0001_initial"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name="machine",
|
||||
name="stats",
|
||||
field=models.JSONField(default=dict),
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name="InstalledBinary",
|
||||
fields=[
|
||||
(
|
||||
"id",
|
||||
models.UUIDField(
|
||||
default=None,
|
||||
editable=False,
|
||||
primary_key=True,
|
||||
serialize=False,
|
||||
unique=True,
|
||||
verbose_name="ID",
|
||||
),
|
||||
),
|
||||
# Removed: abid field - ABID system removed
|
||||
(
|
||||
"created_at",
|
||||
archivebox.base_models.models.AutoDateTimeField(
|
||||
db_index=True, default=None
|
||||
),
|
||||
),
|
||||
("modified_at", models.DateTimeField(auto_now=True)),
|
||||
("name", models.CharField(default=None, max_length=63)),
|
||||
("binprovider", models.CharField(default=None, max_length=31)),
|
||||
("abspath", models.CharField(default=None, max_length=255)),
|
||||
("version", models.CharField(default=None, max_length=32)),
|
||||
("sha256", models.CharField(default=None, max_length=64)),
|
||||
(
|
||||
"machine",
|
||||
models.ForeignKey(
|
||||
default=None,
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
to="machine.machine",
|
||||
),
|
||||
),
|
||||
],
|
||||
options={
|
||||
"unique_together": {
|
||||
("machine", "name", "binprovider", "abspath", "version", "sha256")
|
||||
},
|
||||
},
|
||||
),
|
||||
migrations.RunPython(drop_installedbinary_abid_if_exist, reverse_code=migrations.RunPython.noop),
|
||||
]
|
||||
@@ -1,50 +0,0 @@
|
||||
# Generated by Django 5.1.1 on 2024-10-03 09:20
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
("machine", "0002_alter_machine_stats_installedbinary"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterModelOptions(
|
||||
name="installedbinary",
|
||||
options={
|
||||
"verbose_name": "Installed Binary",
|
||||
"verbose_name_plural": "Installed Binaries",
|
||||
},
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name="installedbinary",
|
||||
name="num_uses_failed",
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name="installedbinary",
|
||||
name="num_uses_succeeded",
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name="machine",
|
||||
name="num_uses_failed",
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name="machine",
|
||||
name="num_uses_succeeded",
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name="networkinterface",
|
||||
name="num_uses_failed",
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name="networkinterface",
|
||||
name="num_uses_succeeded",
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
]
|
||||
@@ -1,49 +0,0 @@
|
||||
# Generated by Django 5.1.1 on 2024-10-03 09:50
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
("machine", "0003_alter_installedbinary_options_and_more"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name="installedbinary",
|
||||
name="abspath",
|
||||
field=models.CharField(blank=True, default=None, max_length=255),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name="installedbinary",
|
||||
name="binprovider",
|
||||
field=models.CharField(blank=True, default=None, max_length=31),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name="installedbinary",
|
||||
name="machine",
|
||||
field=models.ForeignKey(
|
||||
blank=True,
|
||||
default=None,
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
to="machine.machine",
|
||||
),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name="installedbinary",
|
||||
name="name",
|
||||
field=models.CharField(blank=True, default=None, max_length=63),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name="installedbinary",
|
||||
name="sha256",
|
||||
field=models.CharField(blank=True, default=None, max_length=64),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name="installedbinary",
|
||||
name="version",
|
||||
field=models.CharField(blank=True, default=None, max_length=32),
|
||||
),
|
||||
]
|
||||
@@ -95,17 +95,17 @@ def check_io_encoding():
|
||||
|
||||
def check_not_root():
|
||||
from archivebox.config.permissions import IS_ROOT, IN_DOCKER
|
||||
|
||||
|
||||
attempted_command = ' '.join(sys.argv[1:]) if len(sys.argv) > 1 else ''
|
||||
is_getting_help = '-h' in sys.argv or '--help' in sys.argv or 'help' in sys.argv
|
||||
is_getting_version = '--version' in sys.argv or 'version' in sys.argv
|
||||
is_installing = 'setup' in sys.argv or 'install' in sys.argv
|
||||
|
||||
|
||||
if IS_ROOT and not (is_getting_help or is_getting_version or is_installing):
|
||||
print('[red][!] ArchiveBox should never be run as root![/red]', file=sys.stderr)
|
||||
print(' For more information, see the security overview documentation:', file=sys.stderr)
|
||||
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root', file=sys.stderr)
|
||||
|
||||
|
||||
if IN_DOCKER:
|
||||
print('[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:', file=sys.stderr)
|
||||
print(' docker compose run archivebox {attempted_command}', file=sys.stderr)
|
||||
@@ -116,6 +116,17 @@ def check_not_root():
|
||||
raise SystemExit(2)
|
||||
|
||||
|
||||
def check_not_inside_source_dir():
    """Prevent running ArchiveBox from inside its source checkout.

    Running from the repo root would pollute the source tree with data files
    (index db, archive/ dir, etc.).

    Raises:
        SystemExit: if cwd looks like the ArchiveBox repo root, unless
            DATA_DIR points somewhere other than cwd or we are running
            under a test runner.
    """
    cwd = Path(os.getcwd()).resolve()
    # Heuristic for "cwd is the repo root": package dir and pyproject.toml both present.
    is_source_dir = (cwd / 'archivebox' / '__init__.py').exists() and (cwd / 'pyproject.toml').exists()
    # Fix: use the same stripped value for both the truthiness check and the
    # Path comparison (previously the raw env value was resolved, so a padded
    # DATA_DIR could compare inconsistently).
    data_dir = os.environ.get('DATA_DIR', '').strip()
    data_dir_set_elsewhere = bool(data_dir) and Path(data_dir).resolve() != cwd
    # pytest/unittest legitimately execute from inside the source tree.
    is_testing = 'pytest' in sys.modules or 'unittest' in sys.modules

    if is_source_dir and not data_dir_set_elsewhere and not is_testing:
        raise SystemExit('[!] Cannot run from source dir, set DATA_DIR or cd to a data folder first')
|
||||
|
||||
|
||||
def check_data_dir_permissions():
|
||||
from archivebox import DATA_DIR
|
||||
from archivebox.misc.logging import STDERR
|
||||
|
||||
61
archivebox/plugins/archive_org/tests/test_archive_org.py
Normal file
61
archivebox/plugins/archive_org/tests/test_archive_org.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""
|
||||
Integration tests for archive_org plugin
|
||||
|
||||
Tests verify standalone archive.org extractor execution.
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
ARCHIVE_ORG_HOOK = PLUGIN_DIR / 'on_Snapshot__13_archive_org.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
def test_hook_script_exists():
    """Sanity check: the archive.org snapshot hook script is present on disk."""
    assert ARCHIVE_ORG_HOOK.exists()
|
||||
|
||||
def test_submits_to_archive_org():
    """Run the hook against a real URL and check its JSONL output markers.

    NOTE(review): hits the live network (archive.org), so both exit code 0
    (success) and 1 (graceful failure) are accepted.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        result = subprocess.run(
            [sys.executable, str(ARCHIVE_ORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir, capture_output=True, text=True, timeout=60
        )

        assert result.returncode in (0, 1)
        assert 'RESULT_JSON=' in result.stdout

        # Should either succeed or fail gracefully
        assert 'STATUS=' in result.stdout
|
||||
|
||||
def test_config_save_archive_org_false_skips():
    """SAVE_ARCHIVE_DOT_ORG=False should make the hook skip (or no-op succeed)."""
    with tempfile.TemporaryDirectory() as tmpdir:
        import os
        env = os.environ.copy()
        env['SAVE_ARCHIVE_DOT_ORG'] = 'False'

        result = subprocess.run(
            [sys.executable, str(ARCHIVE_ORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=tmpdir, capture_output=True, text=True, env=env, timeout=30
        )

        # Only assert on the happy path; a nonzero exit is tolerated here.
        if result.returncode == 0:
            assert 'STATUS=skipped' in result.stdout or 'STATUS=succeeded' in result.stdout
|
||||
|
||||
def test_handles_timeout():
    """With TIMEOUT=1 the hook should still exit cleanly (0) or fail gracefully (1)."""
    with tempfile.TemporaryDirectory() as tmpdir:
        import os
        env = os.environ.copy()
        env['TIMEOUT'] = '1'

        result = subprocess.run(
            [sys.executable, str(ARCHIVE_ORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=tmpdir, capture_output=True, text=True, env=env, timeout=30
        )

        assert result.returncode in (0, 1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
149
archivebox/plugins/chrome_session/on_Crawl__00_install_chrome.py
Executable file
149
archivebox/plugins/chrome_session/on_Crawl__00_install_chrome.py
Executable file
@@ -0,0 +1,149 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install Chrome/Chromium if not already available.
|
||||
|
||||
Runs at crawl start to ensure Chrome is installed.
|
||||
Uses playwright to install chromium if no system Chrome found.
|
||||
Outputs JSONL for InstalledBinary.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def find_chrome():
    """Locate a usable Chrome/Chromium executable.

    Resolution order:
      1. $CHROME_BINARY env var (if it points at an existing file)
      2. known binary names on $PATH (Chrome variants before Chromium variants)
      3. well-known absolute install locations (macOS app bundles, then Linux)

    Returns the absolute path as a string, or None if nothing was found.
    """
    # Binary names searched on $PATH, in order of preference.
    candidate_names = (
        'google-chrome',
        'google-chrome-stable',
        'google-chrome-beta',
        'google-chrome-canary',
        'google-chrome-unstable',
        'google-chrome-dev',
        'chrome',
        'chromium',
        'chromium-browser',
        'chromium-browser-beta',
        'chromium-browser-unstable',
        'chromium-browser-canary',
        'chromium-browser-dev',
    )
    # Absolute paths checked directly (macOS first, then Linux).
    candidate_paths = (
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
        '/Applications/Chromium.app/Contents/MacOS/Chromium',
        '/usr/bin/google-chrome',
        '/usr/bin/google-chrome-stable',
        '/usr/bin/chromium',
        '/usr/bin/chromium-browser',
        '/snap/bin/chromium',
        '/opt/google/chrome/chrome',
    )

    # 1. An explicit override always wins.
    override = os.environ.get('CHROME_BINARY', '')
    if override and Path(override).is_file():
        return override

    # 2. Scan $PATH for each known binary name.
    for candidate in candidate_names:
        resolved = shutil.which(candidate)
        if resolved:
            return resolved

    # 3. Fall back to probing well-known absolute locations.
    for candidate in candidate_paths:
        if Path(candidate).is_file():
            return candidate

    return None
|
||||
|
||||
|
||||
def main():
    """Ensure a Chrome/Chromium binary is available, emitting JSONL records.

    On success prints one ``InstalledBinary`` JSON object to stdout and exits 0.
    On failure prints a ``Dependency`` request record (so a provider hook can
    attempt installation later) plus an error to stderr, and exits 1.
    """
    try:
        # First try to find system Chrome
        system_chrome = find_chrome()
        if system_chrome:
            # Found on the system: report it as provided by the 'env' provider.
            # version/sha256 are left None here — not probed for system binaries.
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'chrome',
                'abspath': str(system_chrome),
                'version': None,
                'sha256': None,
                'binprovider': 'env',
            }))
            sys.exit(0)

        # If not found in system, try to install chromium via apt/brew
        from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides

        # NOTE(review): model_rebuild() appears needed to finalize the pydantic
        # models before instantiation — confirm against abx_pkg docs.
        AptProvider.model_rebuild()
        BrewProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # Try chromium-browser or chromium via system package managers
        for binary_name in ['chromium', 'chromium-browser', 'google-chrome']:
            try:
                chrome_binary = Binary(
                    name=binary_name,
                    binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
                )

                # Try to load, install if not found
                try:
                    loaded = chrome_binary.load()
                    if not loaded or not loaded.abspath:
                        raise Exception("Not loaded")
                except Exception:
                    # Install via system package manager
                    loaded = chrome_binary.install()

                if loaded and loaded.abspath:
                    # Output InstalledBinary JSONL
                    print(json.dumps({
                        'type': 'InstalledBinary',
                        'name': 'chrome',
                        'abspath': str(loaded.abspath),
                        'version': str(loaded.version) if loaded.version else None,
                        'sha256': loaded.sha256,
                        'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
                    }))
                    sys.exit(0)
            except Exception:
                # This candidate name failed to load/install; try the next one.
                continue

        # If all attempts failed
        # Emit a Dependency request so an installer hook can pick it up later.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'chrome',
            'bin_providers': 'apt,brew,env',
        }))
        print("Failed to install Chrome/Chromium", file=sys.stderr)
        sys.exit(1)

    except Exception as e:
        # Unexpected error (e.g. abx_pkg import failure): still emit the
        # Dependency request so the failure is machine-readable.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'chrome',
            'bin_providers': 'apt,brew,env',
        }))
        print(f"Error installing Chrome: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
0
archivebox/plugins/chrome_session/tests/__init__.py
Normal file
0
archivebox/plugins/chrome_session/tests/__init__.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""
|
||||
Integration tests for chrome_session plugin
|
||||
|
||||
Tests verify:
|
||||
1. Install hook finds system Chrome or installs chromium
|
||||
2. Verify deps with abx-pkg
|
||||
3. Chrome session script exists
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
CHROME_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_chrome.py'
|
||||
CHROME_SESSION_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_session.js'
|
||||
|
||||
|
||||
def test_hook_script_exists():
    """Verify chrome session hook exists."""
    # The JS snapshot hook is this plugin's main entrypoint; fail fast if missing.
    assert CHROME_SESSION_HOOK.exists(), f"Hook not found: {CHROME_SESSION_HOOK}"
|
||||
|
||||
|
||||
def test_chrome_install_hook():
    """Test chrome install hook to find or install Chrome/Chromium.

    Runs the install hook as a subprocess and asserts it emits an
    ``InstalledBinary`` JSONL record pointing at an existing binary.
    NOTE(review): may install chromium — hence the generous 600s timeout.
    """
    result = subprocess.run(
        [sys.executable, str(CHROME_INSTALL_HOOK)],
        capture_output=True,
        text=True,
        timeout=600
    )

    assert result.returncode == 0, f"Install hook failed: {result.stderr}"

    # Verify InstalledBinary JSONL output
    found_binary = False
    for line in result.stdout.strip().split('\n'):
        if line.strip():
            try:
                record = json.loads(line)
                if record.get('type') == 'InstalledBinary':
                    assert record['name'] == 'chrome'
                    assert record['abspath']
                    assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}"
                    found_binary = True
                    break
            except json.JSONDecodeError:
                # Non-JSON log lines on stdout are ignored.
                pass

    assert found_binary, "Should output InstalledBinary record"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
    """Verify chrome is available via abx-pkg after hook installation."""
    from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides

    # NOTE(review): model_rebuild() appears needed to finalize the pydantic
    # provider models before instantiation — confirm against abx_pkg docs.
    AptProvider.model_rebuild()
    BrewProvider.model_rebuild()
    EnvProvider.model_rebuild()

    # Try various chrome binary names
    for binary_name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']:
        try:
            chrome_binary = Binary(
                name=binary_name,
                binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
            )
            chrome_loaded = chrome_binary.load()
            if chrome_loaded and chrome_loaded.abspath:
                # Found at least one chrome variant
                assert Path(chrome_loaded.abspath).exists()
                return
        except Exception:
            # This variant failed to load; try the next name.
            continue

    # If we get here, chrome should still be available from system
    import shutil
    assert shutil.which('chromium') or shutil.which('chrome') or shutil.which('google-chrome'), \
        "Chrome should be available after install hook"
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
205
archivebox/plugins/dom/tests/test_dom.py
Normal file
205
archivebox/plugins/dom/tests/test_dom.py
Normal file
@@ -0,0 +1,205 @@
|
||||
"""
|
||||
Integration tests for dom plugin
|
||||
|
||||
Tests verify:
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via chrome_session validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
4. DOM extraction works on https://example.com
|
||||
5. JSONL output is correct
|
||||
6. Filesystem output contains actual page content
|
||||
7. Config options work
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
DOM_HOOK = PLUGIN_DIR / 'on_Snapshot__36_dom.js'
|
||||
CHROME_VALIDATE_HOOK = PLUGINS_ROOT / 'chrome_session' / 'on_Crawl__00_validate_chrome.py'
|
||||
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Dependency__install_using_npm_provider.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
def test_hook_script_exists():
    """Verify on_Snapshot hook exists."""
    # The JS extractor hook is this plugin's main entrypoint; fail fast if missing.
    assert DOM_HOOK.exists(), f"Hook not found: {DOM_HOOK}"
|
||||
|
||||
|
||||
def test_chrome_validation_and_install():
    """Test chrome validation hook to install puppeteer-core if needed.

    Exit code 1 from the validation hook means "binary missing" and is
    expected to come with a ``Dependency`` JSONL request on stdout, which we
    forward to the npm provider hook; exit code 0 means already installed.
    """
    # Run chrome validation hook (from chrome_session plugin)
    result = subprocess.run(
        [sys.executable, str(CHROME_VALIDATE_HOOK)],
        capture_output=True,
        text=True,
        timeout=30
    )

    # If exit 1, binary not found - need to install
    if result.returncode == 1:
        # Parse Dependency request from JSONL
        dependency_request = None
        for line in result.stdout.strip().split('\n'):
            if line.strip():
                try:
                    record = json.loads(line)
                    if record.get('type') == 'Dependency':
                        dependency_request = record
                        break
                except json.JSONDecodeError:
                    # Non-JSON log lines on stdout are ignored.
                    pass

        if dependency_request:
            bin_name = dependency_request['bin_name']
            bin_providers = dependency_request['bin_providers']

            # Install via npm provider hook
            install_result = subprocess.run(
                [
                    sys.executable,
                    str(NPM_PROVIDER_HOOK),
                    '--dependency-id', 'test-dep-001',
                    '--bin-name', bin_name,
                    '--bin-providers', bin_providers
                ],
                capture_output=True,
                text=True,
                timeout=600
            )

            assert install_result.returncode == 0, f"Install failed: {install_result.stderr}"

            # Verify installation via JSONL output
            for line in install_result.stdout.strip().split('\n'):
                if line.strip():
                    try:
                        record = json.loads(line)
                        if record.get('type') == 'InstalledBinary':
                            assert record['name'] == bin_name
                            assert record['abspath']
                            break
                    except json.JSONDecodeError:
                        pass
    else:
        # Binary already available, verify via JSONL output
        assert result.returncode == 0, f"Validation failed: {result.stderr}"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
    """Verify dependencies are available via abx-pkg after hook installation."""
    from abx_pkg import Binary, EnvProvider, BinProviderOverrides

    # NOTE(review): model_rebuild() appears needed to finalize the pydantic
    # model before instantiation — confirm against abx_pkg docs.
    EnvProvider.model_rebuild()

    # Verify node is available
    node_binary = Binary(name='node', binproviders=[EnvProvider()])
    node_loaded = node_binary.load()
    assert node_loaded and node_loaded.abspath, "Node.js required for dom plugin"
|
||||
|
||||
|
||||
def test_extracts_dom_from_example_com():
    """Test full workflow: extract DOM from real example.com via hook.

    NOTE(review): hits the live network; asserts both the JSONL protocol
    (STATUS= / RESULT_JSON= markers) and the on-disk dom/output.html content.
    """
    # Prerequisites checked by earlier test

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Run DOM extraction hook
        result = subprocess.run(
            ['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=120
        )

        assert result.returncode == 0, f"Extraction failed: {result.stderr}"

        # Verify JSONL output
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"

        # Parse JSONL result
        result_json = None
        for line in result.stdout.split('\n'):
            if line.startswith('RESULT_JSON='):
                result_json = json.loads(line.split('=', 1)[1])
                break

        assert result_json, "Should have RESULT_JSON"
        assert result_json['extractor'] == 'dom'
        assert result_json['status'] == 'succeeded'
        assert result_json['url'] == TEST_URL

        # Verify filesystem output
        dom_dir = tmpdir / 'dom'
        assert dom_dir.exists(), "Output directory not created"

        dom_file = dom_dir / 'output.html'
        assert dom_file.exists(), "output.html not created"

        # Verify HTML content contains REAL example.com text
        html_content = dom_file.read_text(errors='ignore')
        assert len(html_content) > 200, f"HTML content too short: {len(html_content)} bytes"
        assert '<html' in html_content.lower(), "Missing <html> tag"
        assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML"
        assert ('this domain' in html_content.lower() or
                'illustrative examples' in html_content.lower()), \
            "Missing example.com description text"
|
||||
|
||||
|
||||
def test_config_save_dom_false_skips():
    """Test that SAVE_DOM=False causes skip."""
    import os

    with tempfile.TemporaryDirectory() as workdir:
        # Inherit the current environment but disable the DOM extractor.
        hook_env = {**os.environ, 'SAVE_DOM': 'False'}

        proc = subprocess.run(
            ['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
            cwd=Path(workdir),
            capture_output=True,
            text=True,
            env=hook_env,
            timeout=30,
        )

        assert proc.returncode == 0, f"Should exit 0 when skipping: {proc.stderr}"
        assert 'STATUS=skipped' in proc.stdout, "Should report skipped status"
|
||||
|
||||
|
||||
def test_staticfile_present_skips():
    """Test that dom skips when staticfile already downloaded."""
    with tempfile.TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Simulate a prior staticfile extractor run by pre-creating its output.
        staticfile_dir = workdir / 'staticfile'
        staticfile_dir.mkdir()
        (staticfile_dir / 'index.html').write_text('<html>test</html>')

        proc = subprocess.run(
            ['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=teststatic'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=30,
        )

        assert proc.returncode == 0, "Should exit 0 when skipping"
        assert 'STATUS=skipped' in proc.stdout, "Should report skipped status"
        assert 'staticfile' in proc.stdout.lower(), "Should mention staticfile"


if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
68
archivebox/plugins/git/on_Crawl__00_install_git.py
Executable file
68
archivebox/plugins/git/on_Crawl__00_install_git.py
Executable file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
"""
Install git if not already available.

Runs at crawl start to ensure git is installed.
On success, outputs an InstalledBinary JSONL record and exits 0.
On failure, outputs a Dependency JSONL record, writes a message to
stderr, and exits 1.
"""

import json
import sys


def _emit_dependency_needed():
    """Print a Dependency JSONL record telling the caller git is still needed."""
    print(json.dumps({
        'type': 'Dependency',
        'bin_name': 'git',
        'bin_providers': 'apt,brew,env',
    }))


def main():
    try:
        # Imported lazily so a missing abx_pkg is reported as a Dependency
        # record instead of crashing with an unhandled ImportError.
        from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider

        AptProvider.model_rebuild()
        BrewProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # git binary and package have same name across all providers
        git_binary = Binary(
            name='git',
            binproviders=[AptProvider(), BrewProvider(), EnvProvider()],
        )

        # Try to load an existing install first; install only if not found.
        try:
            loaded = git_binary.load()
            if not loaded or not loaded.abspath:
                raise Exception("Not loaded")
        except Exception:
            # Install via system package manager
            loaded = git_binary.install()

        if loaded and loaded.abspath:
            # Output InstalledBinary JSONL
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'git',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256,
                'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
            }))
            sys.exit(0)
        else:
            _emit_dependency_needed()
            print("Failed to install git", file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        # SystemExit is a BaseException, so the sys.exit() calls above are
        # not swallowed here.
        _emit_dependency_needed()
        print(f"Error installing git: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||
90
archivebox/plugins/git/tests/test_git.py
Normal file
90
archivebox/plugins/git/tests/test_git.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""
|
||||
Integration tests for git plugin
|
||||
|
||||
Tests verify:
|
||||
1. Install hook installs git via abx-pkg
|
||||
2. Verify deps with abx-pkg
|
||||
3. Standalone git extractor execution
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
GIT_HOOK = PLUGIN_DIR / 'on_Snapshot__12_git.py'
|
||||
GIT_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_git.py'
|
||||
TEST_URL = 'https://github.com/example/repo.git'
|
||||
|
||||
def test_hook_script_exists():
|
||||
assert GIT_HOOK.exists()
|
||||
|
||||
def test_git_install_hook():
|
||||
"""Test git install hook to install git if needed."""
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(GIT_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'git'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify git is available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
AptProvider.model_rebuild()
|
||||
BrewProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||
git_loaded = git_binary.load()
|
||||
assert git_loaded and git_loaded.abspath, "git should be available after install hook"
|
||||
|
||||
def test_reports_missing_git():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = {'PATH': '/nonexistent'}
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(GIT_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
|
||||
cwd=tmpdir, capture_output=True, text=True, env=env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
combined = result.stdout + result.stderr
|
||||
assert 'DEPENDENCY_NEEDED' in combined or 'git' in combined.lower() or 'ERROR=' in combined
|
||||
|
||||
def test_handles_non_git_url():
|
||||
if not shutil.which('git'):
|
||||
pytest.skip("git not installed")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(GIT_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'],
|
||||
cwd=tmpdir, capture_output=True, text=True, timeout=30
|
||||
)
|
||||
# Should fail or skip for non-git URL
|
||||
assert result.returncode in (0, 1)
|
||||
assert 'STATUS=' in result.stdout
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
53
archivebox/plugins/htmltotext/tests/test_htmltotext.py
Normal file
53
archivebox/plugins/htmltotext/tests/test_htmltotext.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""
|
||||
Integration tests for htmltotext plugin
|
||||
|
||||
Tests verify standalone htmltotext extractor execution.
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
HTMLTOTEXT_HOOK = PLUGIN_DIR / 'on_Snapshot__54_htmltotext.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
def test_hook_script_exists():
|
||||
assert HTMLTOTEXT_HOOK.exists()
|
||||
|
||||
def test_extracts_text_from_html():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
# Create HTML source
|
||||
(tmpdir / 'singlefile').mkdir()
|
||||
(tmpdir / 'singlefile' / 'singlefile.html').write_text('<html><body><h1>Example Domain</h1><p>This domain is for examples.</p></body></html>')
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(HTMLTOTEXT_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
|
||||
cwd=tmpdir, capture_output=True, text=True, timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode in (0, 1)
|
||||
assert 'RESULT_JSON=' in result.stdout
|
||||
|
||||
if result.returncode == 0:
|
||||
assert 'STATUS=succeeded' in result.stdout
|
||||
output_file = tmpdir / 'htmltotext' / 'content.txt'
|
||||
if output_file.exists():
|
||||
content = output_file.read_text()
|
||||
assert len(content) > 0
|
||||
|
||||
def test_fails_gracefully_without_html():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(HTMLTOTEXT_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
|
||||
cwd=tmpdir, capture_output=True, text=True, timeout=30
|
||||
)
|
||||
assert result.returncode in (0, 1)
|
||||
combined = result.stdout + result.stderr
|
||||
assert 'STATUS=' in combined
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
67
archivebox/plugins/media/on_Crawl__00_install_ytdlp.py
Executable file
67
archivebox/plugins/media/on_Crawl__00_install_ytdlp.py
Executable file
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env python3
"""
Install yt-dlp if not already available.

Runs at crawl start to ensure yt-dlp is installed.
On success, outputs an InstalledBinary JSONL record and exits 0.
On failure, outputs a Dependency JSONL record, writes a message to
stderr, and exits 1.
"""

import json
import sys


def _emit_dependency_needed():
    """Print a Dependency JSONL record telling the caller yt-dlp is still needed."""
    print(json.dumps({
        'type': 'Dependency',
        'bin_name': 'yt-dlp',
        # BUGFIX: was 'pip,brew,env', but only pip and env providers are
        # actually configured for this binary below.
        'bin_providers': 'pip,env',
    }))


def main():
    try:
        # Imported lazily so a missing abx_pkg is reported as a Dependency
        # record instead of crashing with an unhandled ImportError.
        from abx_pkg import Binary, PipProvider, EnvProvider

        PipProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # yt-dlp binary and package have same name
        ytdlp_binary = Binary(
            name='yt-dlp',
            binproviders=[PipProvider(), EnvProvider()],
        )

        # Try to load an existing install first; install only if not found.
        try:
            loaded = ytdlp_binary.load()
            if not loaded or not loaded.abspath:
                raise Exception("Not loaded")
        except Exception:
            # Install via pip
            loaded = ytdlp_binary.install()

        if loaded and loaded.abspath:
            # Output InstalledBinary JSONL
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'yt-dlp',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256,
                'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
            }))
            sys.exit(0)
        else:
            _emit_dependency_needed()
            print("Failed to install yt-dlp", file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        # SystemExit is a BaseException, so the sys.exit() calls above are
        # not swallowed here.
        _emit_dependency_needed()
        print(f"Error installing yt-dlp: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||
148
archivebox/plugins/media/tests/test_media.py
Normal file
148
archivebox/plugins/media/tests/test_media.py
Normal file
@@ -0,0 +1,148 @@
|
||||
"""
|
||||
Integration tests for media plugin
|
||||
|
||||
Tests verify:
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
4. Media extraction works on video URLs
|
||||
5. JSONL output is correct
|
||||
6. Config options work
|
||||
7. Handles non-media URLs gracefully
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
MEDIA_HOOK = PLUGIN_DIR / 'on_Snapshot__51_media.py'
|
||||
MEDIA_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_ytdlp.py'
|
||||
TEST_URL = 'https://example.com/video.mp4'
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
assert MEDIA_HOOK.exists(), f"Hook not found: {MEDIA_HOOK}"
|
||||
|
||||
|
||||
def test_ytdlp_install_hook():
|
||||
"""Test yt-dlp install hook to install yt-dlp if needed."""
|
||||
# Run yt-dlp install hook
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MEDIA_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'yt-dlp'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify yt-dlp is available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
PipProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# Verify yt-dlp is available
|
||||
ytdlp_binary = Binary(name='yt-dlp', binproviders=[PipProvider(), EnvProvider()])
|
||||
ytdlp_loaded = ytdlp_binary.load()
|
||||
assert ytdlp_loaded and ytdlp_loaded.abspath, "yt-dlp should be available after install hook"
|
||||
|
||||
def test_handles_non_media_url():
|
||||
"""Test that media extractor handles non-media URLs gracefully via hook."""
|
||||
# Prerequisites checked by earlier test
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Run media extraction hook on non-media URL
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MEDIA_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
# Should exit 0 even for non-media URL
|
||||
assert result.returncode == 0, f"Should handle non-media URL gracefully: {result.stderr}"
|
||||
|
||||
# Verify JSONL output
|
||||
assert 'STATUS=' in result.stdout, "Should report status"
|
||||
assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
|
||||
|
||||
# Parse JSONL result
|
||||
result_json = None
|
||||
for line in result.stdout.split('\n'):
|
||||
if line.startswith('RESULT_JSON='):
|
||||
result_json = json.loads(line.split('=', 1)[1])
|
||||
break
|
||||
|
||||
assert result_json, "Should have RESULT_JSON"
|
||||
assert result_json['extractor'] == 'media'
|
||||
|
||||
|
||||
def test_config_save_media_false_skips():
|
||||
"""Test that SAVE_MEDIA=False causes skip."""
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = os.environ.copy()
|
||||
env['SAVE_MEDIA'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MEDIA_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Should exit 0 when skipping: {result.stderr}"
|
||||
assert 'STATUS=' in result.stdout
|
||||
|
||||
|
||||
def test_config_timeout():
|
||||
"""Test that MEDIA_TIMEOUT config is respected."""
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = os.environ.copy()
|
||||
env['MEDIA_TIMEOUT'] = '5'
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MEDIA_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, "Should complete without hanging"
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
68
archivebox/plugins/mercury/on_Crawl__00_install_mercury.py
Executable file
68
archivebox/plugins/mercury/on_Crawl__00_install_mercury.py
Executable file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
"""
Install mercury-parser if not already available.

Runs at crawl start to ensure mercury-parser is installed.
On success, outputs an InstalledBinary JSONL record and exits 0.
On failure, outputs a Dependency JSONL record, writes a message to
stderr, and exits 1.
"""

import json
import sys


def _emit_dependency_needed():
    """Print a Dependency JSONL record telling the caller mercury-parser is still needed."""
    print(json.dumps({
        'type': 'Dependency',
        'bin_name': 'mercury-parser',
        'bin_providers': 'npm,env',
    }))


def main():
    try:
        # Imported lazily so a missing abx_pkg is reported as a Dependency
        # record instead of crashing with an unhandled ImportError.
        from abx_pkg import Binary, NpmProvider, EnvProvider

        NpmProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # Note: npm package is @postlight/mercury-parser, binary is mercury-parser
        mercury_binary = Binary(
            name='mercury-parser',
            binproviders=[NpmProvider(), EnvProvider()],
            overrides={'npm': {'packages': ['@postlight/mercury-parser']}},
        )

        # Try to load an existing install first; install only if not found.
        try:
            loaded = mercury_binary.load()
            if not loaded or not loaded.abspath:
                raise Exception("Not loaded")
        except Exception:
            # Install via npm
            loaded = mercury_binary.install()

        if loaded and loaded.abspath:
            # Output InstalledBinary JSONL
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'mercury-parser',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256,
                'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
            }))
            sys.exit(0)
        else:
            _emit_dependency_needed()
            print("Failed to install mercury-parser", file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        # SystemExit is a BaseException, so the sys.exit() calls above are
        # not swallowed here.
        _emit_dependency_needed()
        print(f"Error installing mercury-parser: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||
164
archivebox/plugins/mercury/tests/test_mercury.py
Normal file
164
archivebox/plugins/mercury/tests/test_mercury.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""
|
||||
Integration tests for mercury plugin
|
||||
|
||||
Tests verify:
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
4. Mercury extraction works on https://example.com
|
||||
5. JSONL output is correct
|
||||
6. Filesystem output contains extracted content
|
||||
7. Config options work
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
MERCURY_HOOK = PLUGIN_DIR / 'on_Snapshot__53_mercury.py'
|
||||
MERCURY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_mercury.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
assert MERCURY_HOOK.exists(), f"Hook not found: {MERCURY_HOOK}"
|
||||
|
||||
|
||||
def test_mercury_install_hook():
|
||||
"""Test mercury install hook to install mercury-parser if needed."""
|
||||
# Run mercury install hook
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MERCURY_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'mercury-parser'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify mercury-parser is available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
NpmProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# Verify mercury-parser is available
|
||||
mercury_binary = Binary(
|
||||
name='mercury-parser',
|
||||
binproviders=[NpmProvider(), EnvProvider()],
|
||||
overrides={'npm': {'packages': ['@postlight/mercury-parser']}}
|
||||
)
|
||||
mercury_loaded = mercury_binary.load()
|
||||
assert mercury_loaded and mercury_loaded.abspath, "mercury-parser should be available after install hook"
|
||||
|
||||
def test_extracts_with_mercury_parser():
|
||||
"""Test full workflow: extract with mercury-parser from real HTML via hook."""
|
||||
# Prerequisites checked by earlier test
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Create HTML source that mercury can parse
|
||||
(tmpdir / 'singlefile').mkdir()
|
||||
(tmpdir / 'singlefile' / 'singlefile.html').write_text(
|
||||
'<html><head><title>Test Article</title></head><body>'
|
||||
'<article><h1>Example Article</h1><p>This is test content for mercury parser.</p></article>'
|
||||
'</body></html>'
|
||||
)
|
||||
|
||||
# Run mercury extraction hook
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
|
||||
|
||||
# Verify JSONL output
|
||||
assert 'STATUS=' in result.stdout, "Should report status"
|
||||
assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
|
||||
|
||||
# Parse JSONL result
|
||||
result_json = None
|
||||
for line in result.stdout.split('\n'):
|
||||
if line.startswith('RESULT_JSON='):
|
||||
result_json = json.loads(line.split('=', 1)[1])
|
||||
break
|
||||
|
||||
assert result_json, "Should have RESULT_JSON"
|
||||
assert result_json['extractor'] == 'mercury'
|
||||
|
||||
# Verify filesystem output if extraction succeeded
|
||||
if result_json['status'] == 'succeeded':
|
||||
mercury_dir = tmpdir / 'mercury'
|
||||
assert mercury_dir.exists(), "Output directory not created"
|
||||
|
||||
output_file = mercury_dir / 'content.html'
|
||||
assert output_file.exists(), "content.html not created"
|
||||
|
||||
content = output_file.read_text()
|
||||
assert len(content) > 0, "Output should not be empty"
|
||||
|
||||
def test_config_save_mercury_false_skips():
|
||||
"""Test that SAVE_MERCURY=False causes skip."""
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = os.environ.copy()
|
||||
env['SAVE_MERCURY'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Should exit 0 when skipping: {result.stderr}"
|
||||
assert 'STATUS=' in result.stdout
|
||||
|
||||
|
||||
def test_fails_gracefully_without_html():
|
||||
"""Test that mercury fails gracefully when no HTML source exists."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, "Should exit 0 even when no HTML source"
|
||||
assert 'STATUS=' in result.stdout
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
925
archivebox/plugins/package-lock.json
generated
Normal file
925
archivebox/plugins/package-lock.json
generated
Normal file
@@ -0,0 +1,925 @@
|
||||
{
|
||||
"name": "archivebox-plugins",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "archivebox-plugins",
|
||||
"dependencies": {
|
||||
"puppeteer-core": "^24.34.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@puppeteer/browsers": {
|
||||
"version": "2.11.0",
|
||||
"resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.11.0.tgz",
|
||||
"integrity": "sha512-n6oQX6mYkG8TRPuPXmbPidkUbsSRalhmaaVAQxvH1IkQy63cwsH+kOjB3e4cpCDHg0aSvsiX9bQ4s2VB6mGWUQ==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"debug": "^4.4.3",
|
||||
"extract-zip": "^2.0.1",
|
||||
"progress": "^2.0.3",
|
||||
"proxy-agent": "^6.5.0",
|
||||
"semver": "^7.7.3",
|
||||
"tar-fs": "^3.1.1",
|
||||
"yargs": "^17.7.2"
|
||||
},
|
||||
"bin": {
|
||||
"browsers": "lib/cjs/main-cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/@tootallnate/quickjs-emscripten": {
|
||||
"version": "0.23.0",
|
||||
"resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz",
|
||||
"integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/node": {
|
||||
"version": "25.0.3",
|
||||
"resolved": "https://registry.npmjs.org/@types/node/-/node-25.0.3.tgz",
|
||||
"integrity": "sha512-W609buLVRVmeW693xKfzHeIV6nJGGz98uCPfeXI1ELMLXVeKYZ9m15fAMSaUPBHYLGFsVRcMmSCksQOrZV9BYA==",
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"undici-types": "~7.16.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/yauzl": {
|
||||
"version": "2.10.3",
|
||||
"resolved": "https://registry.npmjs.org/@types/yauzl/-/yauzl-2.10.3.tgz",
|
||||
"integrity": "sha512-oJoftv0LSuaDZE3Le4DbKX+KS9G36NzOeSap90UIK0yMA/NhKJhqlSGtNDORNRaIbQfzjXDrQa0ytJ6mNRGz/Q==",
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/agent-base": {
|
||||
"version": "7.1.4",
|
||||
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz",
|
||||
"integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/ansi-regex": {
|
||||
"version": "5.0.1",
|
||||
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
|
||||
"integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/ansi-styles": {
|
||||
"version": "4.3.0",
|
||||
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
|
||||
"integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"color-convert": "^2.0.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/chalk/ansi-styles?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/ast-types": {
|
||||
"version": "0.13.4",
|
||||
"resolved": "https://registry.npmjs.org/ast-types/-/ast-types-0.13.4.tgz",
|
||||
"integrity": "sha512-x1FCFnFifvYDDzTaLII71vG5uvDwgtmDTEVWAxrgeiR8VjMONcCXJx7E+USjDtHlwFmt9MysbqgF9b9Vjr6w+w==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"tslib": "^2.0.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/b4a": {
|
||||
"version": "1.7.3",
|
||||
"resolved": "https://registry.npmjs.org/b4a/-/b4a-1.7.3.tgz",
|
||||
"integrity": "sha512-5Q2mfq2WfGuFp3uS//0s6baOJLMoVduPYVeNmDYxu5OUA1/cBfvr2RIS7vi62LdNj/urk1hfmj867I3qt6uZ7Q==",
|
||||
"license": "Apache-2.0",
|
||||
"peerDependencies": {
|
||||
"react-native-b4a": "*"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"react-native-b4a": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/bare-events": {
|
||||
"version": "2.8.2",
|
||||
"resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.8.2.tgz",
|
||||
"integrity": "sha512-riJjyv1/mHLIPX4RwiK+oW9/4c3TEUeORHKefKAKnZ5kyslbN+HXowtbaVEqt4IMUB7OXlfixcs6gsFeo/jhiQ==",
|
||||
"license": "Apache-2.0",
|
||||
"peerDependencies": {
|
||||
"bare-abort-controller": "*"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"bare-abort-controller": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/bare-fs": {
|
||||
"version": "4.5.2",
|
||||
"resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-4.5.2.tgz",
|
||||
"integrity": "sha512-veTnRzkb6aPHOvSKIOy60KzURfBdUflr5VReI+NSaPL6xf+XLdONQgZgpYvUuZLVQ8dCqxpBAudaOM1+KpAUxw==",
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"bare-events": "^2.5.4",
|
||||
"bare-path": "^3.0.0",
|
||||
"bare-stream": "^2.6.4",
|
||||
"bare-url": "^2.2.2",
|
||||
"fast-fifo": "^1.3.2"
|
||||
},
|
||||
"engines": {
|
||||
"bare": ">=1.16.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"bare-buffer": "*"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"bare-buffer": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/bare-os": {
|
||||
"version": "3.6.2",
|
||||
"resolved": "https://registry.npmjs.org/bare-os/-/bare-os-3.6.2.tgz",
|
||||
"integrity": "sha512-T+V1+1srU2qYNBmJCXZkUY5vQ0B4FSlL3QDROnKQYOqeiQR8UbjNHlPa+TIbM4cuidiN9GaTaOZgSEgsvPbh5A==",
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"bare": ">=1.14.0"
|
||||
}
|
||||
},
|
||||
"node_modules/bare-path": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/bare-path/-/bare-path-3.0.0.tgz",
|
||||
"integrity": "sha512-tyfW2cQcB5NN8Saijrhqn0Zh7AnFNsnczRcuWODH0eYAXBsJ5gVxAUuNr7tsHSC6IZ77cA0SitzT+s47kot8Mw==",
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"bare-os": "^3.0.1"
|
||||
}
|
||||
},
|
||||
"node_modules/bare-stream": {
|
||||
"version": "2.7.0",
|
||||
"resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.7.0.tgz",
|
||||
"integrity": "sha512-oyXQNicV1y8nc2aKffH+BUHFRXmx6VrPzlnaEvMhram0nPBrKcEdcyBg5r08D0i8VxngHFAiVyn1QKXpSG0B8A==",
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"streamx": "^2.21.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"bare-buffer": "*",
|
||||
"bare-events": "*"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"bare-buffer": {
|
||||
"optional": true
|
||||
},
|
||||
"bare-events": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/bare-url": {
|
||||
"version": "2.3.2",
|
||||
"resolved": "https://registry.npmjs.org/bare-url/-/bare-url-2.3.2.tgz",
|
||||
"integrity": "sha512-ZMq4gd9ngV5aTMa5p9+UfY0b3skwhHELaDkhEHetMdX0LRkW9kzaym4oo/Eh+Ghm0CCDuMTsRIGM/ytUc1ZYmw==",
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"bare-path": "^3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/basic-ftp": {
|
||||
"version": "5.0.5",
|
||||
"resolved": "https://registry.npmjs.org/basic-ftp/-/basic-ftp-5.0.5.tgz",
|
||||
"integrity": "sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=10.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/buffer-crc32": {
|
||||
"version": "0.2.13",
|
||||
"resolved": "https://registry.npmjs.org/buffer-crc32/-/buffer-crc32-0.2.13.tgz",
|
||||
"integrity": "sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/chromium-bidi": {
|
||||
"version": "12.0.1",
|
||||
"resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-12.0.1.tgz",
|
||||
"integrity": "sha512-fGg+6jr0xjQhzpy5N4ErZxQ4wF7KLEvhGZXD6EgvZKDhu7iOhZXnZhcDxPJDcwTcrD48NPzOCo84RP2lv3Z+Cg==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"mitt": "^3.0.1",
|
||||
"zod": "^3.24.1"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"devtools-protocol": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/cliui": {
|
||||
"version": "8.0.1",
|
||||
"resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz",
|
||||
"integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"string-width": "^4.2.0",
|
||||
"strip-ansi": "^6.0.1",
|
||||
"wrap-ansi": "^7.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/color-convert": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
|
||||
"integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"color-name": "~1.1.4"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=7.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/color-name": {
|
||||
"version": "1.1.4",
|
||||
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
|
||||
"integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/data-uri-to-buffer": {
|
||||
"version": "6.0.2",
|
||||
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
|
||||
"integrity": "sha512-7hvf7/GW8e86rW0ptuwS3OcBGDjIi6SZva7hCyWC0yYry2cOPmLIjXAUHI6DK2HsnwJd9ifmt57i8eV2n4YNpw==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/debug": {
|
||||
"version": "4.4.3",
|
||||
"resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
|
||||
"integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"ms": "^2.1.3"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"supports-color": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/degenerator": {
|
||||
"version": "5.0.1",
|
||||
"resolved": "https://registry.npmjs.org/degenerator/-/degenerator-5.0.1.tgz",
|
||||
"integrity": "sha512-TllpMR/t0M5sqCXfj85i4XaAzxmS5tVA16dqvdkMwGmzI+dXLXnw3J+3Vdv7VKw+ThlTMboK6i9rnZ6Nntj5CQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"ast-types": "^0.13.4",
|
||||
"escodegen": "^2.1.0",
|
||||
"esprima": "^4.0.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/devtools-protocol": {
|
||||
"version": "0.0.1534754",
|
||||
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1534754.tgz",
|
||||
"integrity": "sha512-26T91cV5dbOYnXdJi5qQHoTtUoNEqwkHcAyu/IKtjIAxiEqPMrDiRkDOPWVsGfNZGmlQVHQbZRSjD8sxagWVsQ==",
|
||||
"license": "BSD-3-Clause",
|
||||
"peer": true
|
||||
},
|
||||
"node_modules/emoji-regex": {
|
||||
"version": "8.0.0",
|
||||
"resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
|
||||
"integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/end-of-stream": {
|
||||
"version": "1.4.5",
|
||||
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
|
||||
"integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"once": "^1.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/escalade": {
|
||||
"version": "3.2.0",
|
||||
"resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
|
||||
"integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=6"
|
||||
}
|
||||
},
|
||||
"node_modules/escodegen": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/escodegen/-/escodegen-2.1.0.tgz",
|
||||
"integrity": "sha512-2NlIDTwUWJN0mRPQOdtQBzbUHvdGY2P1VXSyU83Q3xKxM7WHX2Ql8dKq782Q9TgQUNOLEzEYu9bzLNj1q88I5w==",
|
||||
"license": "BSD-2-Clause",
|
||||
"dependencies": {
|
||||
"esprima": "^4.0.1",
|
||||
"estraverse": "^5.2.0",
|
||||
"esutils": "^2.0.2"
|
||||
},
|
||||
"bin": {
|
||||
"escodegen": "bin/escodegen.js",
|
||||
"esgenerate": "bin/esgenerate.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"source-map": "~0.6.1"
|
||||
}
|
||||
},
|
||||
"node_modules/esprima": {
|
||||
"version": "4.0.1",
|
||||
"resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz",
|
||||
"integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==",
|
||||
"license": "BSD-2-Clause",
|
||||
"bin": {
|
||||
"esparse": "bin/esparse.js",
|
||||
"esvalidate": "bin/esvalidate.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/estraverse": {
|
||||
"version": "5.3.0",
|
||||
"resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz",
|
||||
"integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==",
|
||||
"license": "BSD-2-Clause",
|
||||
"engines": {
|
||||
"node": ">=4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/esutils": {
|
||||
"version": "2.0.3",
|
||||
"resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz",
|
||||
"integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==",
|
||||
"license": "BSD-2-Clause",
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/events-universal": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/events-universal/-/events-universal-1.0.1.tgz",
|
||||
"integrity": "sha512-LUd5euvbMLpwOF8m6ivPCbhQeSiYVNb8Vs0fQ8QjXo0JTkEHpz8pxdQf0gStltaPpw0Cca8b39KxvK9cfKRiAw==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"bare-events": "^2.7.0"
|
||||
}
|
||||
},
|
||||
"node_modules/extract-zip": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-2.0.1.tgz",
|
||||
"integrity": "sha512-GDhU9ntwuKyGXdZBUgTIe+vXnWj0fppUEtMDL0+idd5Sta8TGpHssn/eusA9mrPr9qNDym6SxAYZjNvCn/9RBg==",
|
||||
"license": "BSD-2-Clause",
|
||||
"dependencies": {
|
||||
"debug": "^4.1.1",
|
||||
"get-stream": "^5.1.0",
|
||||
"yauzl": "^2.10.0"
|
||||
},
|
||||
"bin": {
|
||||
"extract-zip": "cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 10.17.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@types/yauzl": "^2.9.1"
|
||||
}
|
||||
},
|
||||
"node_modules/fast-fifo": {
|
||||
"version": "1.3.2",
|
||||
"resolved": "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz",
|
||||
"integrity": "sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/fd-slicer": {
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.1.0.tgz",
|
||||
"integrity": "sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"pend": "~1.2.0"
|
||||
}
|
||||
},
|
||||
"node_modules/get-caller-file": {
|
||||
"version": "2.0.5",
|
||||
"resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz",
|
||||
"integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==",
|
||||
"license": "ISC",
|
||||
"engines": {
|
||||
"node": "6.* || 8.* || >= 10.*"
|
||||
}
|
||||
},
|
||||
"node_modules/get-stream": {
|
||||
"version": "5.2.0",
|
||||
"resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz",
|
||||
"integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"pump": "^3.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/get-uri": {
|
||||
"version": "6.0.5",
|
||||
"resolved": "https://registry.npmjs.org/get-uri/-/get-uri-6.0.5.tgz",
|
||||
"integrity": "sha512-b1O07XYq8eRuVzBNgJLstU6FYc1tS6wnMtF1I1D9lE8LxZSOGZ7LhxN54yPP6mGw5f2CkXY2BQUL9Fx41qvcIg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"basic-ftp": "^5.0.2",
|
||||
"data-uri-to-buffer": "^6.0.2",
|
||||
"debug": "^4.3.4"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/http-proxy-agent": {
|
||||
"version": "7.0.2",
|
||||
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz",
|
||||
"integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"agent-base": "^7.1.0",
|
||||
"debug": "^4.3.4"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/https-proxy-agent": {
|
||||
"version": "7.0.6",
|
||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz",
|
||||
"integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"agent-base": "^7.1.2",
|
||||
"debug": "4"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/ip-address": {
|
||||
"version": "10.1.0",
|
||||
"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz",
|
||||
"integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 12"
|
||||
}
|
||||
},
|
||||
"node_modules/is-fullwidth-code-point": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
|
||||
"integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/lru-cache": {
|
||||
"version": "7.18.3",
|
||||
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-7.18.3.tgz",
|
||||
"integrity": "sha512-jumlc0BIUrS3qJGgIkWZsyfAM7NCWiBcCDhnd+3NNM5KbBmLTgHVfWBcg6W+rLUsIpzpERPsvwUP7CckAQSOoA==",
|
||||
"license": "ISC",
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/mitt": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz",
|
||||
"integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/ms": {
|
||||
"version": "2.1.3",
|
||||
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
|
||||
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/netmask": {
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/netmask/-/netmask-2.0.2.tgz",
|
||||
"integrity": "sha512-dBpDMdxv9Irdq66304OLfEmQ9tbNRFnFTuZiLo+bD+r332bBmMJ8GBLXklIXXgxd3+v9+KUnZaUR5PJMa75Gsg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 0.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/once": {
|
||||
"version": "1.4.0",
|
||||
"resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
|
||||
"integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"wrappy": "1"
|
||||
}
|
||||
},
|
||||
"node_modules/pac-proxy-agent": {
|
||||
"version": "7.2.0",
|
||||
"resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz",
|
||||
"integrity": "sha512-TEB8ESquiLMc0lV8vcd5Ql/JAKAoyzHFXaStwjkzpOpC5Yv+pIzLfHvjTSdf3vpa2bMiUQrg9i6276yn8666aA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@tootallnate/quickjs-emscripten": "^0.23.0",
|
||||
"agent-base": "^7.1.2",
|
||||
"debug": "^4.3.4",
|
||||
"get-uri": "^6.0.1",
|
||||
"http-proxy-agent": "^7.0.0",
|
||||
"https-proxy-agent": "^7.0.6",
|
||||
"pac-resolver": "^7.0.1",
|
||||
"socks-proxy-agent": "^8.0.5"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/pac-resolver": {
|
||||
"version": "7.0.1",
|
||||
"resolved": "https://registry.npmjs.org/pac-resolver/-/pac-resolver-7.0.1.tgz",
|
||||
"integrity": "sha512-5NPgf87AT2STgwa2ntRMr45jTKrYBGkVU36yT0ig/n/GMAa3oPqhZfIQ2kMEimReg0+t9kZViDVZ83qfVUlckg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"degenerator": "^5.0.0",
|
||||
"netmask": "^2.0.2"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/pend": {
|
||||
"version": "1.2.0",
|
||||
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
|
||||
"integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/progress": {
|
||||
"version": "2.0.3",
|
||||
"resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz",
|
||||
"integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=0.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/proxy-agent": {
|
||||
"version": "6.5.0",
|
||||
"resolved": "https://registry.npmjs.org/proxy-agent/-/proxy-agent-6.5.0.tgz",
|
||||
"integrity": "sha512-TmatMXdr2KlRiA2CyDu8GqR8EjahTG3aY3nXjdzFyoZbmB8hrBsTyMezhULIXKnC0jpfjlmiZ3+EaCzoInSu/A==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"agent-base": "^7.1.2",
|
||||
"debug": "^4.3.4",
|
||||
"http-proxy-agent": "^7.0.1",
|
||||
"https-proxy-agent": "^7.0.6",
|
||||
"lru-cache": "^7.14.1",
|
||||
"pac-proxy-agent": "^7.1.0",
|
||||
"proxy-from-env": "^1.1.0",
|
||||
"socks-proxy-agent": "^8.0.5"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/proxy-from-env": {
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz",
|
||||
"integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/pump": {
|
||||
"version": "3.0.3",
|
||||
"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.3.tgz",
|
||||
"integrity": "sha512-todwxLMY7/heScKmntwQG8CXVkWUOdYxIvY2s0VWAAMh/nd8SoYiRaKjlr7+iCs984f2P8zvrfWcDDYVb73NfA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"end-of-stream": "^1.1.0",
|
||||
"once": "^1.3.1"
|
||||
}
|
||||
},
|
||||
"node_modules/puppeteer-core": {
|
||||
"version": "24.34.0",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-24.34.0.tgz",
|
||||
"integrity": "sha512-24evawO+mUGW4mvS2a2ivwLdX3gk8zRLZr9HP+7+VT2vBQnm0oh9jJEZmUE3ePJhRkYlZ93i7OMpdcoi2qNCLg==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"@puppeteer/browsers": "2.11.0",
|
||||
"chromium-bidi": "12.0.1",
|
||||
"debug": "^4.4.3",
|
||||
"devtools-protocol": "0.0.1534754",
|
||||
"typed-query-selector": "^2.12.0",
|
||||
"webdriver-bidi-protocol": "0.3.10",
|
||||
"ws": "^8.18.3"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/require-directory": {
|
||||
"version": "2.1.1",
|
||||
"resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
|
||||
"integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/semver": {
|
||||
"version": "7.7.3",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
|
||||
"integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==",
|
||||
"license": "ISC",
|
||||
"bin": {
|
||||
"semver": "bin/semver.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=10"
|
||||
}
|
||||
},
|
||||
"node_modules/smart-buffer": {
|
||||
"version": "4.2.0",
|
||||
"resolved": "https://registry.npmjs.org/smart-buffer/-/smart-buffer-4.2.0.tgz",
|
||||
"integrity": "sha512-94hK0Hh8rPqQl2xXc3HsaBoOXKV20MToPkcXvwbISWLEs+64sBq5kFgn2kJDHb1Pry9yrP0dxrCI9RRci7RXKg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 6.0.0",
|
||||
"npm": ">= 3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/socks": {
|
||||
"version": "2.8.7",
|
||||
"resolved": "https://registry.npmjs.org/socks/-/socks-2.8.7.tgz",
|
||||
"integrity": "sha512-HLpt+uLy/pxB+bum/9DzAgiKS8CX1EvbWxI4zlmgGCExImLdiad2iCwXT5Z4c9c3Eq8rP2318mPW2c+QbtjK8A==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"ip-address": "^10.0.1",
|
||||
"smart-buffer": "^4.2.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 10.0.0",
|
||||
"npm": ">= 3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/socks-proxy-agent": {
|
||||
"version": "8.0.5",
|
||||
"resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-8.0.5.tgz",
|
||||
"integrity": "sha512-HehCEsotFqbPW9sJ8WVYB6UbmIMv7kUUORIF2Nncq4VQvBfNBLibW9YZR5dlYCSUhwcD628pRllm7n+E+YTzJw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"agent-base": "^7.1.2",
|
||||
"debug": "^4.3.4",
|
||||
"socks": "^2.8.3"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/source-map": {
|
||||
"version": "0.6.1",
|
||||
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
|
||||
"integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==",
|
||||
"license": "BSD-3-Clause",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/streamx": {
|
||||
"version": "2.23.0",
|
||||
"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.23.0.tgz",
|
||||
"integrity": "sha512-kn+e44esVfn2Fa/O0CPFcex27fjIL6MkVae0Mm6q+E6f0hWv578YCERbv+4m02cjxvDsPKLnmxral/rR6lBMAg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"events-universal": "^1.0.0",
|
||||
"fast-fifo": "^1.3.2",
|
||||
"text-decoder": "^1.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/string-width": {
|
||||
"version": "4.2.3",
|
||||
"resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
|
||||
"integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"emoji-regex": "^8.0.0",
|
||||
"is-fullwidth-code-point": "^3.0.0",
|
||||
"strip-ansi": "^6.0.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/strip-ansi": {
|
||||
"version": "6.0.1",
|
||||
"resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
|
||||
"integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"ansi-regex": "^5.0.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/tar-fs": {
|
||||
"version": "3.1.1",
|
||||
"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.1.1.tgz",
|
||||
"integrity": "sha512-LZA0oaPOc2fVo82Txf3gw+AkEd38szODlptMYejQUhndHMLQ9M059uXR+AfS7DNo0NpINvSqDsvyaCrBVkptWg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"pump": "^3.0.0",
|
||||
"tar-stream": "^3.1.5"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"bare-fs": "^4.0.1",
|
||||
"bare-path": "^3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/tar-stream": {
|
||||
"version": "3.1.7",
|
||||
"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz",
|
||||
"integrity": "sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"b4a": "^1.6.4",
|
||||
"fast-fifo": "^1.2.0",
|
||||
"streamx": "^2.15.0"
|
||||
}
|
||||
},
|
||||
"node_modules/text-decoder": {
|
||||
"version": "1.2.3",
|
||||
"resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.3.tgz",
|
||||
"integrity": "sha512-3/o9z3X0X0fTupwsYvR03pJ/DjWuqqrfwBgTQzdWDiQSm9KitAyz/9WqsT2JQW7KV2m+bC2ol/zqpW37NHxLaA==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"b4a": "^1.6.4"
|
||||
}
|
||||
},
|
||||
"node_modules/tslib": {
|
||||
"version": "2.8.1",
|
||||
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
|
||||
"integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
|
||||
"license": "0BSD"
|
||||
},
|
||||
"node_modules/typed-query-selector": {
|
||||
"version": "2.12.0",
|
||||
"resolved": "https://registry.npmjs.org/typed-query-selector/-/typed-query-selector-2.12.0.tgz",
|
||||
"integrity": "sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/undici-types": {
|
||||
"version": "7.16.0",
|
||||
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
|
||||
"integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==",
|
||||
"license": "MIT",
|
||||
"optional": true
|
||||
},
|
||||
"node_modules/webdriver-bidi-protocol": {
|
||||
"version": "0.3.10",
|
||||
"resolved": "https://registry.npmjs.org/webdriver-bidi-protocol/-/webdriver-bidi-protocol-0.3.10.tgz",
|
||||
"integrity": "sha512-5LAE43jAVLOhB/QqX4bwSiv0Hg1HBfMmOuwBSXHdvg4GMGu9Y0lIq7p4R/yySu6w74WmaR4GM4H9t2IwLW7hgw==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/wrap-ansi": {
|
||||
"version": "7.0.0",
|
||||
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
|
||||
"integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"ansi-styles": "^4.0.0",
|
||||
"string-width": "^4.1.0",
|
||||
"strip-ansi": "^6.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=10"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/chalk/wrap-ansi?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/wrappy": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
|
||||
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/ws": {
|
||||
"version": "8.18.3",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz",
|
||||
"integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=10.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"bufferutil": "^4.0.1",
|
||||
"utf-8-validate": ">=5.0.2"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"bufferutil": {
|
||||
"optional": true
|
||||
},
|
||||
"utf-8-validate": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/y18n": {
|
||||
"version": "5.0.8",
|
||||
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
|
||||
"integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==",
|
||||
"license": "ISC",
|
||||
"engines": {
|
||||
"node": ">=10"
|
||||
}
|
||||
},
|
||||
"node_modules/yargs": {
|
||||
"version": "17.7.2",
|
||||
"resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz",
|
||||
"integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"cliui": "^8.0.1",
|
||||
"escalade": "^3.1.1",
|
||||
"get-caller-file": "^2.0.5",
|
||||
"require-directory": "^2.1.1",
|
||||
"string-width": "^4.2.3",
|
||||
"y18n": "^5.0.5",
|
||||
"yargs-parser": "^21.1.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/yargs-parser": {
|
||||
"version": "21.1.1",
|
||||
"resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz",
|
||||
"integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==",
|
||||
"license": "ISC",
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/yauzl": {
|
||||
"version": "2.10.0",
|
||||
"resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.10.0.tgz",
|
||||
"integrity": "sha512-p4a9I6X6nu6IhoGmBqAcbJy1mlC4j27vEPZX9F4L4/vZT3Lyq1VkFHw/V/PUcB9Buo+DG3iHkT0x3Qya58zc3g==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"buffer-crc32": "~0.2.3",
|
||||
"fd-slicer": "~1.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/zod": {
|
||||
"version": "3.25.76",
|
||||
"resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",
|
||||
"integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==",
|
||||
"license": "MIT",
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/colinhacks"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
1
archivebox/plugins/package.json
Normal file
1
archivebox/plugins/package.json
Normal file
@@ -0,0 +1 @@
|
||||
{"name":"archivebox-plugins","private":true,"dependencies":{"puppeteer-core":"^24.34.0"}}
|
||||
232
archivebox/plugins/pdf/tests/test_pdf.py
Normal file
232
archivebox/plugins/pdf/tests/test_pdf.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
Integration tests for pdf plugin
|
||||
|
||||
Tests verify:
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via chrome_session validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
4. PDF extraction works on https://example.com
|
||||
5. JSONL output is correct
|
||||
6. Filesystem output is valid PDF file
|
||||
7. Config options work
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Locations of the plugin under test and the sibling plugin hooks it relies on.
PLUGIN_DIR = Path(__file__).parent.parent        # archivebox/plugins/pdf/
PLUGINS_ROOT = PLUGIN_DIR.parent                 # archivebox/plugins/
PDF_HOOK = PLUGIN_DIR / 'on_Snapshot__35_pdf.js'
# Hooks from sibling plugins: validate Chrome availability / install via npm.
CHROME_VALIDATE_HOOK = PLUGINS_ROOT / 'chrome_session' / 'on_Crawl__00_validate_chrome.py'
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Dependency__install_using_npm_provider.py'
# Live URL fetched by the end-to-end tests (network access required).
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
def test_hook_script_exists():
    """The pdf plugin's on_Snapshot hook script must be present on disk."""
    missing_msg = f"Hook not found: {PDF_HOOK}"
    assert PDF_HOOK.exists(), missing_msg
|
||||
|
||||
|
||||
def _first_jsonl_record(stdout: str, record_type: str):
    """Return the first JSONL record with ``type == record_type``, or None.

    Lines that are blank or not valid JSON are skipped (the hooks mix
    human-readable log lines with JSONL records on stdout).
    """
    for line in stdout.strip().split('\n'):
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        if record.get('type') == record_type:
            return record
    return None


def test_chrome_validation_and_install():
    """Run the chrome_session validation hook, installing puppeteer-core if needed.

    Exit code 0 means the browser binary is already available.  Exit code 1
    means the hook emitted a JSONL ``Dependency`` request, which we satisfy by
    invoking the npm provider hook and then confirm via its ``InstalledBinary``
    JSONL record.  Unlike the previous version, a missing ``Dependency`` or
    ``InstalledBinary`` record is now a hard failure instead of a silent pass.
    """
    result = subprocess.run(
        [sys.executable, str(CHROME_VALIDATE_HOOK)],
        capture_output=True,
        text=True,
        timeout=30,
    )

    if result.returncode == 1:
        # Binary not found: the hook must have emitted a Dependency request.
        dependency_request = _first_jsonl_record(result.stdout, 'Dependency')
        assert dependency_request is not None, (
            f"Hook exited 1 but emitted no Dependency record:\n{result.stdout}"
        )

        bin_name = dependency_request['bin_name']
        bin_providers = dependency_request['bin_providers']

        # Install the requested binary via the npm provider hook.
        install_result = subprocess.run(
            [
                sys.executable,
                str(NPM_PROVIDER_HOOK),
                '--dependency-id', 'test-dep-001',
                '--bin-name', bin_name,
                '--bin-providers', bin_providers,
            ],
            capture_output=True,
            text=True,
            timeout=600,
        )
        assert install_result.returncode == 0, f"Install failed: {install_result.stderr}"

        # The provider must report what it installed via an InstalledBinary record.
        installed = _first_jsonl_record(install_result.stdout, 'InstalledBinary')
        assert installed is not None, (
            f"Installer emitted no InstalledBinary record:\n{install_result.stdout}"
        )
        assert installed['name'] == bin_name
        assert installed['abspath']
    else:
        # Binary already available: the validation hook must have succeeded.
        assert result.returncode == 0, f"Validation failed: {result.stderr}"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
    """Verify dependencies are resolvable via abx-pkg after hook installation.

    Uses abx-pkg's EnvProvider to confirm that a ``node`` binary is on PATH,
    since the pdf hook is executed with Node.js.
    """
    # Removed unused BinProviderOverrides import from the original version.
    from abx_pkg import Binary, EnvProvider

    # abx-pkg models need an explicit rebuild before instantiation.
    EnvProvider.model_rebuild()

    node_binary = Binary(name='node', binproviders=[EnvProvider()])
    node_loaded = node_binary.load()
    assert node_loaded and node_loaded.abspath, "Node.js required for pdf plugin"
|
||||
|
||||
|
||||
def test_extracts_pdf_from_example_com():
    """End-to-end: the hook should archive example.com as a valid PDF file.

    Runs the Node hook in a scratch directory, then checks its KEY=VALUE
    stdout protocol, the RESULT_JSON payload, and the on-disk pdf/output.pdf.
    Prerequisites (Chrome + puppeteer-core) are checked by earlier tests.
    """
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)

        proc = subprocess.run(
            ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=120,
        )
        assert proc.returncode == 0, f"Extraction failed: {proc.stderr}"

        # The hook reports progress and results as KEY=VALUE lines on stdout.
        assert 'STATUS=succeeded' in proc.stdout, "Should report success"
        assert 'RESULT_JSON=' in proc.stdout, "Should output RESULT_JSON"

        # Locate and parse the RESULT_JSON line.
        result_line = next(
            (ln for ln in proc.stdout.split('\n') if ln.startswith('RESULT_JSON=')),
            None,
        )
        result_json = json.loads(result_line.split('=', 1)[1]) if result_line else None

        assert result_json, "Should have RESULT_JSON"
        assert result_json['extractor'] == 'pdf'
        assert result_json['status'] == 'succeeded'
        assert result_json['url'] == TEST_URL

        # Filesystem side effects: ./pdf/output.pdf under the working dir.
        output_dir = workdir / 'pdf'
        assert output_dir.exists(), "Output directory not created"

        output_file = output_dir / 'output.pdf'
        assert output_file.exists(), "output.pdf not created"

        # Sanity-check the size: non-trivial but not absurdly large.
        size = output_file.stat().st_size
        assert size > 500, f"PDF too small: {size} bytes"
        assert size < 10 * 1024 * 1024, f"PDF suspiciously large: {size} bytes"

        # Real PDFs start with the %PDF magic bytes.
        assert output_file.read_bytes()[:4] == b'%PDF', "Should be valid PDF file"
|
||||
|
||||
|
||||
def test_config_save_pdf_false_skips():
    """SAVE_PDF=False must make the hook skip its work yet still exit cleanly."""
    import os

    with tempfile.TemporaryDirectory() as scratch:
        workdir = Path(scratch)
        # Same environment as the parent process, with saving disabled.
        hook_env = dict(os.environ, SAVE_PDF='False')

        proc = subprocess.run(
            ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=hook_env,
            timeout=30,
        )

        # Skipping is not an error: exit 0 plus a STATUS= line is still required.
        assert proc.returncode == 0, f"Should exit 0 when skipping: {proc.stderr}"
        assert 'STATUS=' in proc.stdout
||||
def test_reports_missing_chrome():
    """A bogus CHROME_BINARY path should surface a Chrome/browser error message."""
    import os

    with tempfile.TemporaryDirectory() as scratch:
        workdir = Path(scratch)

        # Point the hook at a Chrome binary that cannot exist.
        hook_env = dict(os.environ, CHROME_BINARY='/nonexistent/chrome')

        proc = subprocess.run(
            ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test123'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=hook_env,
            timeout=30,
        )

        # When the hook fails, its output must mention the missing browser.
        if proc.returncode != 0:
            output = proc.stdout + proc.stderr
            lowered = output.lower()
            assert 'chrome' in lowered or 'browser' in lowered or 'ERROR=' in output
||||
def test_config_timeout_honored():
    """With CHROME_TIMEOUT=5 the hook must finish (pass or fail) without hanging."""
    import os

    with tempfile.TemporaryDirectory() as scratch:
        workdir = Path(scratch)

        # Force an aggressively short browser timeout.
        hook_env = dict(os.environ, CHROME_TIMEOUT='5')

        proc = subprocess.run(
            ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=hook_env,
            timeout=30,
        )

        # Either outcome is acceptable; hanging (subprocess timeout) is not.
        assert proc.returncode in (0, 1), "Should complete without hanging"


if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
68
archivebox/plugins/readability/on_Crawl__00_install_readability.py
Executable file
68
archivebox/plugins/readability/on_Crawl__00_install_readability.py
Executable file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install readability-extractor if not already available.
|
||||
|
||||
Runs at crawl start to ensure readability-extractor is installed.
|
||||
Outputs JSONL for InstalledBinary.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
    """Ensure readability-extractor is installed; emit JSONL describing the result.

    On success prints an InstalledBinary record and exits 0.  On any failure
    (including abx_pkg being unavailable) prints a Dependency request record
    plus an error message on stderr, and exits 1.
    """
    # Fallback record emitted whenever we cannot produce an InstalledBinary.
    dependency_record = {
        'type': 'Dependency',
        'bin_name': 'readability-extractor',
        'bin_providers': 'npm,env',
    }
    try:
        from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides

        # abx_pkg provider models must be rebuilt before instantiation.
        NpmProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # The npm package is published at github:ArchiveBox/readability-extractor.
        binary = Binary(
            name='readability-extractor',
            binproviders=[NpmProvider(), EnvProvider()],
            overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
        )

        # Prefer an already-present binary; fall back to installing via npm.
        try:
            loaded = binary.load()
            if not loaded or not loaded.abspath:
                raise Exception("Not loaded")
        except Exception:
            loaded = binary.install()

        if loaded and loaded.abspath:
            # Success: report the resolved binary as an InstalledBinary record.
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'readability-extractor',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256,
                'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
            }))
            sys.exit(0)

        # Install ran but produced no usable binary: request the dependency.
        print(json.dumps(dependency_record))
        print("Failed to install readability-extractor", file=sys.stderr)
        sys.exit(1)

    except Exception as e:
        # Import or provider failure: request the dependency and report the error.
        print(json.dumps(dependency_record))
        print(f"Error installing readability-extractor: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||
@@ -6,10 +6,10 @@ Usage: on_Snapshot__readability.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Creates readability/ directory with content.html, content.txt, article.json
|
||||
|
||||
Environment variables:
|
||||
READABILITY_BINARY: Path to readability-cli binary
|
||||
READABILITY_BINARY: Path to readability-extractor binary
|
||||
TIMEOUT: Timeout in seconds (default: 60)
|
||||
|
||||
Note: Requires readability-cli: npm install -g readability-cli
|
||||
Note: Requires readability-extractor from https://github.com/ArchiveBox/readability-extractor
|
||||
This extractor looks for HTML source from other extractors (wget, singlefile, dom)
|
||||
"""
|
||||
|
||||
@@ -27,7 +27,7 @@ import rich_click as click
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'readability'
|
||||
BIN_NAME = 'readability-cli'
|
||||
BIN_NAME = 'readability-extractor'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = 'readability'
|
||||
|
||||
@@ -44,12 +44,12 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
|
||||
|
||||
def find_readability() -> str | None:
|
||||
"""Find readability-cli binary."""
|
||||
"""Find readability-extractor binary."""
|
||||
readability = get_env('READABILITY_BINARY')
|
||||
if readability and os.path.isfile(readability):
|
||||
return readability
|
||||
|
||||
for name in ['readability-cli', 'readable']:
|
||||
for name in ['readability-extractor']:
|
||||
binary = shutil.which(name)
|
||||
if binary:
|
||||
return binary
|
||||
@@ -58,7 +58,7 @@ def find_readability() -> str | None:
|
||||
|
||||
|
||||
def get_version(binary: str) -> str:
|
||||
"""Get readability-cli version."""
|
||||
"""Get readability-extractor version."""
|
||||
try:
|
||||
result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
|
||||
return result.stdout.strip()[:64]
|
||||
@@ -106,24 +106,24 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
try:
|
||||
# Run readability-cli
|
||||
cmd = [binary, '--json', html_source]
|
||||
# Run readability-extractor (outputs JSON by default)
|
||||
cmd = [binary, html_source]
|
||||
result = subprocess.run(cmd, capture_output=True, timeout=timeout)
|
||||
|
||||
if result.returncode != 0:
|
||||
stderr = result.stderr.decode('utf-8', errors='replace')
|
||||
return False, None, f'readability-cli failed: {stderr[:200]}'
|
||||
return False, None, f'readability-extractor failed: {stderr[:200]}'
|
||||
|
||||
# Parse JSON output
|
||||
try:
|
||||
result_json = json.loads(result.stdout)
|
||||
except json.JSONDecodeError:
|
||||
return False, None, 'readability-cli returned invalid JSON'
|
||||
return False, None, 'readability-extractor returned invalid JSON'
|
||||
|
||||
# Extract and save content
|
||||
# readability-cli v2.x uses hyphenated field names
|
||||
text_content = result_json.pop('text-content', result_json.pop('textContent', ''))
|
||||
html_content = result_json.pop('html-content', result_json.pop('content', ''))
|
||||
# readability-extractor uses camelCase field names (textContent, content)
|
||||
text_content = result_json.pop('textContent', result_json.pop('text-content', ''))
|
||||
html_content = result_json.pop('content', result_json.pop('html-content', ''))
|
||||
|
||||
if not text_content and not html_content:
|
||||
return False, None, 'No content extracted'
|
||||
@@ -157,7 +157,7 @@ def main(url: str, snapshot_id: str):
|
||||
# Find binary
|
||||
binary = find_readability()
|
||||
if not binary:
|
||||
print(f'ERROR: readability-cli binary not found', file=sys.stderr)
|
||||
print(f'ERROR: readability-extractor binary not found', file=sys.stderr)
|
||||
print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
|
||||
print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
@@ -187,7 +187,7 @@ def main(url: str, snapshot_id: str):
|
||||
print(f'END_TS={end_ts.isoformat()}')
|
||||
print(f'DURATION={duration:.2f}')
|
||||
if binary:
|
||||
print(f'CMD={binary} --json <html>')
|
||||
print(f'CMD={binary} <html>')
|
||||
if version:
|
||||
print(f'VERSION={version}')
|
||||
if output:
|
||||
|
||||
@@ -2,9 +2,10 @@
|
||||
Integration tests for readability plugin
|
||||
|
||||
Tests verify:
|
||||
1. Plugin reports missing dependency correctly
|
||||
2. readability-cli can be installed via npm (note: package name != binary name)
|
||||
3. Extraction works against real example.com content
|
||||
1. Install hook installs readability-extractor via abx-pkg
|
||||
2. Verify deps with abx-pkg
|
||||
3. Plugin reports missing dependency correctly
|
||||
4. Extraction works against real example.com content
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -20,6 +21,7 @@ import pytest
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py'))
|
||||
READABILITY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_readability.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
@@ -74,7 +76,7 @@ def test_hook_script_exists():
|
||||
|
||||
|
||||
def test_reports_missing_dependency_when_not_installed():
|
||||
"""Test that script reports DEPENDENCY_NEEDED when readability-cli is not found."""
|
||||
"""Test that script reports DEPENDENCY_NEEDED when readability-extractor is not found."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
@@ -96,68 +98,57 @@ def test_reports_missing_dependency_when_not_installed():
|
||||
assert result.returncode != 0, "Should exit non-zero when dependency missing"
|
||||
combined = result.stdout + result.stderr
|
||||
assert 'DEPENDENCY_NEEDED' in combined, "Should output DEPENDENCY_NEEDED"
|
||||
assert 'readability-cli' in combined or 'BIN_NAME' in combined, "Should mention readability-cli"
|
||||
assert 'readability-extractor' in combined or 'BIN_NAME' in combined, "Should mention readability-extractor"
|
||||
|
||||
|
||||
def test_can_install_readability_via_npm():
|
||||
"""Test that readability-cli can be installed via npm and binary becomes available.
|
||||
|
||||
Note: The npm package 'readability-cli' installs a binary named 'readable',
|
||||
so we test the full installation flow using npm install directly.
|
||||
"""
|
||||
|
||||
# Check npm is available
|
||||
if not shutil.which('npm'):
|
||||
pytest.skip("npm not available on this system")
|
||||
|
||||
# Install readability-cli package via npm
|
||||
# The orchestrator/dependency hooks would call this via npm provider
|
||||
def test_readability_install_hook():
|
||||
"""Test readability install hook to install readability-extractor if needed."""
|
||||
result = subprocess.run(
|
||||
['npm', 'install', '-g', 'readability-cli'],
|
||||
[sys.executable, str(READABILITY_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
timeout=600
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"npm install failed: {result.stderr}"
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
|
||||
# Verify the 'readable' binary is now available
|
||||
# (readability-cli package installs as 'readable' not 'readability-cli')
|
||||
result = subprocess.run(['which', 'readable'], capture_output=True, text=True)
|
||||
assert result.returncode == 0, "readable binary not found after npm install"
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'readability-extractor'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
binary_path = result.stdout.strip()
|
||||
assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
|
||||
# Test that it's executable and responds to --version
|
||||
result = subprocess.run(
|
||||
[binary_path, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify readability-extractor is available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
NpmProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
readability_binary = Binary(
|
||||
name='readability-extractor',
|
||||
binproviders=[NpmProvider(), EnvProvider()],
|
||||
overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
||||
)
|
||||
assert result.returncode == 0, f"Binary not executable: {result.stderr}"
|
||||
readability_loaded = readability_binary.load()
|
||||
assert readability_loaded and readability_loaded.abspath, "readability-extractor should be available after install hook"
|
||||
|
||||
|
||||
def test_extracts_article_after_installation():
|
||||
"""Test full workflow: ensure readability-cli installed then extract from example.com HTML."""
|
||||
"""Test full workflow: extract article using readability-extractor from real HTML."""
|
||||
# Prerequisites checked by earlier test (install hook should have run)
|
||||
|
||||
# Check npm is available
|
||||
if not shutil.which('npm'):
|
||||
pytest.skip("npm not available on this system")
|
||||
|
||||
# Ensure readability-cli is installed (orchestrator would handle this)
|
||||
install_result = subprocess.run(
|
||||
['npm', 'install', '-g', 'readability-cli'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
|
||||
if install_result.returncode != 0:
|
||||
pytest.skip(f"Could not install readability-cli: {install_result.stderr}")
|
||||
|
||||
# Now test extraction
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
@@ -213,21 +204,7 @@ def test_extracts_article_after_installation():
|
||||
|
||||
def test_fails_gracefully_without_html_source():
|
||||
"""Test that extraction fails gracefully when no HTML source is available."""
|
||||
|
||||
# Check npm is available
|
||||
if not shutil.which('npm'):
|
||||
pytest.skip("npm not available on this system")
|
||||
|
||||
# Ensure readability-cli is installed
|
||||
install_result = subprocess.run(
|
||||
['npm', 'install', '-g', 'readability-cli'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
|
||||
if install_result.returncode != 0:
|
||||
pytest.skip("Could not install readability-cli")
|
||||
# Prerequisites checked by earlier test (install hook should have run)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
232
archivebox/plugins/screenshot/tests/test_screenshot.py
Normal file
232
archivebox/plugins/screenshot/tests/test_screenshot.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
Integration tests for screenshot plugin
|
||||
|
||||
Tests verify:
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via chrome_session validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
4. Screenshot extraction works on https://example.com
|
||||
5. JSONL output is correct
|
||||
6. Filesystem output is valid PNG image
|
||||
7. Config options work
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
SCREENSHOT_HOOK = PLUGIN_DIR / 'on_Snapshot__34_screenshot.js'
|
||||
CHROME_VALIDATE_HOOK = PLUGINS_ROOT / 'chrome_session' / 'on_Crawl__00_validate_chrome.py'
|
||||
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Dependency__install_using_npm_provider.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
def test_hook_script_exists():
    """The on_Snapshot screenshot hook script must be present in the plugin dir."""
    hook = SCREENSHOT_HOOK
    assert hook.exists(), f"Hook not found: {hook}"
||||
|
||||
def test_chrome_validation_and_install():
    """Run the chrome_session validation hook; install the binary via npm if missing."""

    def first_record(output, record_type):
        # Scan JSONL output for the first record of the given type.
        for raw in output.strip().split('\n'):
            if not raw.strip():
                continue
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                continue
            if parsed.get('type') == record_type:
                return parsed
        return None

    validation = subprocess.run(
        [sys.executable, str(CHROME_VALIDATE_HOOK)],
        capture_output=True,
        text=True,
        timeout=30,
    )

    if validation.returncode != 1:
        # Binary already present: the hook must have exited cleanly.
        assert validation.returncode == 0, f"Validation failed: {validation.stderr}"
        return

    # Exit code 1 means the binary is missing; the hook emits a Dependency request.
    request = first_record(validation.stdout, 'Dependency')
    if not request:
        return

    bin_name = request['bin_name']
    bin_providers = request['bin_providers']

    # Satisfy the request by running the npm provider hook.
    install = subprocess.run(
        [
            sys.executable,
            str(NPM_PROVIDER_HOOK),
            '--dependency-id', 'test-dep-001',
            '--bin-name', bin_name,
            '--bin-providers', bin_providers
        ],
        capture_output=True,
        text=True,
        timeout=600,
    )
    assert install.returncode == 0, f"Install failed: {install.stderr}"

    # The provider hook should report the binary it just installed.
    installed = first_record(install.stdout, 'InstalledBinary')
    if installed:
        assert installed['name'] == bin_name
        assert installed['abspath']
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
    """Confirm Node.js resolves through abx-pkg once the install hooks have run."""
    from abx_pkg import Binary, EnvProvider, BinProviderOverrides

    # Provider models must be rebuilt before instantiation.
    EnvProvider.model_rebuild()

    # The screenshot hook is a Node script, so 'node' must be on PATH.
    node = Binary(name='node', binproviders=[EnvProvider()])
    resolved = node.load()
    assert resolved and resolved.abspath, "Node.js required for screenshot plugin"
|
||||
|
||||
|
||||
def test_extracts_screenshot_from_example_com():
    """End-to-end check: the screenshot hook captures example.com as a valid PNG."""
    # Dependency availability is asserted by the preceding tests.
    with tempfile.TemporaryDirectory() as scratch:
        workdir = Path(scratch)

        # Invoke the extraction hook the same way the orchestrator would.
        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=120,
        )
        assert proc.returncode == 0, f"Extraction failed: {proc.stderr}"

        # The hook must report success and a machine-readable result line.
        assert 'STATUS=succeeded' in proc.stdout, "Should report success"
        assert 'RESULT_JSON=' in proc.stdout, "Should output RESULT_JSON"

        # Pull the first RESULT_JSON= line out of the hook's stdout stream.
        result_json = None
        for line in proc.stdout.split('\n'):
            if line.startswith('RESULT_JSON='):
                result_json = json.loads(line.split('=', 1)[1])
                break

        assert result_json, "Should have RESULT_JSON"
        assert result_json['extractor'] == 'screenshot'
        assert result_json['status'] == 'succeeded'
        assert result_json['url'] == TEST_URL

        # The hook writes screenshot/screenshot.png under the cwd it was given.
        screenshot_dir = workdir / 'screenshot'
        assert screenshot_dir.exists(), "Output directory not created"

        screenshot_file = screenshot_dir / 'screenshot.png'
        assert screenshot_file.exists(), "screenshot.png not created"

        # Sanity-check size bounds, then the PNG magic header.
        file_size = screenshot_file.stat().st_size
        assert file_size > 1000, f"Screenshot too small: {file_size} bytes"
        assert file_size < 10 * 1024 * 1024, f"Screenshot suspiciously large: {file_size} bytes"

        screenshot_data = screenshot_file.read_bytes()
        assert screenshot_data[:8] == b'\x89PNG\r\n\x1a\n', "Should be valid PNG file"
|
||||
|
||||
|
||||
def test_config_save_screenshot_false_skips():
    """SAVE_SCREENSHOT=False must make the hook skip its work yet still exit cleanly."""
    import os

    with tempfile.TemporaryDirectory() as scratch:
        workdir = Path(scratch)
        # Same environment as the parent process, with saving disabled.
        hook_env = dict(os.environ, SAVE_SCREENSHOT='False')

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=hook_env,
            timeout=30,
        )

        # Skipping is not an error: exit 0 plus a STATUS= line is still required.
        assert proc.returncode == 0, f"Should exit 0 when skipping: {proc.stderr}"
        assert 'STATUS=' in proc.stdout
|
||||
|
||||
|
||||
def test_reports_missing_chrome():
    """A bogus CHROME_BINARY path should surface a Chrome/browser error message."""
    import os

    with tempfile.TemporaryDirectory() as scratch:
        workdir = Path(scratch)

        # Point the hook at a Chrome binary that cannot exist.
        hook_env = dict(os.environ, CHROME_BINARY='/nonexistent/chrome')

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test123'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=hook_env,
            timeout=30,
        )

        # When the hook fails, its output must mention the missing browser.
        if proc.returncode != 0:
            output = proc.stdout + proc.stderr
            lowered = output.lower()
            assert 'chrome' in lowered or 'browser' in lowered or 'ERROR=' in output
|
||||
|
||||
|
||||
def test_config_timeout_honored():
    """With CHROME_TIMEOUT=5 the hook must finish (pass or fail) without hanging."""
    import os

    with tempfile.TemporaryDirectory() as scratch:
        workdir = Path(scratch)

        # Force an aggressively short browser timeout.
        hook_env = dict(os.environ, CHROME_TIMEOUT='5')

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=hook_env,
            timeout=30,
        )

        # Either outcome is acceptable; hanging (subprocess timeout) is not.
        assert proc.returncode in (0, 1), "Should complete without hanging"


if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
@@ -1,10 +1,17 @@
|
||||
"""
|
||||
Integration tests - archive example.com with SingleFile and verify output
|
||||
Integration tests for singlefile plugin
|
||||
|
||||
Tests verify:
|
||||
1. on_Crawl hook validates and installs single-file
|
||||
2. Verify deps with abx-pkg
|
||||
3. Extraction works on https://example.com
|
||||
4. JSONL output is correct
|
||||
5. Filesystem output is valid HTML
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
@@ -12,99 +19,108 @@ import pytest
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__04_singlefile.js"
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
SINGLEFILE_HOOK = PLUGIN_DIR / "on_Snapshot__04_singlefile.js"
|
||||
CHROME_VALIDATE_HOOK = PLUGINS_ROOT / 'chrome_session' / 'on_Crawl__00_validate_chrome.py'
|
||||
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Dependency__install_using_npm_provider.py'
|
||||
TEST_URL = "https://example.com"
|
||||
|
||||
|
||||
# Check if single-file CLI is available
|
||||
try:
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
assert SINGLEFILE_HOOK.exists(), f"Hook not found: {SINGLEFILE_HOOK}"
|
||||
|
||||
|
||||
def test_chrome_validation_and_install():
|
||||
"""Test chrome validation hook to install puppeteer-core if needed."""
|
||||
# Run chrome validation hook (from chrome_session plugin)
|
||||
result = subprocess.run(
|
||||
["which", "single-file"],
|
||||
[sys.executable, str(CHROME_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
timeout=5
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
SINGLEFILE_CLI_AVAILABLE = result.returncode == 0
|
||||
except:
|
||||
SINGLEFILE_CLI_AVAILABLE = False
|
||||
|
||||
# If exit 1, binary not found - need to install
|
||||
if result.returncode == 1:
|
||||
# Parse Dependency request from JSONL
|
||||
dependency_request = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
dependency_request = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
if dependency_request:
|
||||
bin_name = dependency_request['bin_name']
|
||||
bin_providers = dependency_request['bin_providers']
|
||||
|
||||
# Install via npm provider hook
|
||||
install_result = subprocess.run(
|
||||
[
|
||||
sys.executable,
|
||||
str(NPM_PROVIDER_HOOK),
|
||||
'--dependency-id', 'test-dep-001',
|
||||
'--bin-name', bin_name,
|
||||
'--bin-providers', bin_providers
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
)
|
||||
|
||||
assert install_result.returncode == 0, f"Install failed: {install_result.stderr}"
|
||||
|
||||
# Verify installation via JSONL output
|
||||
for line in install_result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == bin_name
|
||||
assert record['abspath']
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
else:
|
||||
# Binary already available, verify via JSONL output
|
||||
assert result.returncode == 0, f"Validation failed: {result.stderr}"
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not SINGLEFILE_CLI_AVAILABLE,
|
||||
reason="single-file CLI not installed (npm install -g single-file-cli)"
|
||||
)
|
||||
def test_archives_example_com():
|
||||
"""Archive example.com and verify output contains expected content"""
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify dependencies are available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, EnvProvider, BinProviderOverrides
|
||||
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# Verify node is available (singlefile uses Chrome extension, needs Node)
|
||||
node_binary = Binary(name='node', binproviders=[EnvProvider()])
|
||||
node_loaded = node_binary.load()
|
||||
assert node_loaded and node_loaded.abspath, "Node.js required for singlefile plugin"
|
||||
|
||||
|
||||
def test_singlefile_hook_runs():
|
||||
"""Verify singlefile hook can be executed and completes."""
|
||||
# Prerequisites checked by earlier test
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
output_dir = Path(tmpdir) / "singlefile"
|
||||
output_dir.mkdir()
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
output_file = output_dir / "singlefile.html"
|
||||
|
||||
# Run single-file CLI
|
||||
# Run singlefile extraction hook
|
||||
result = subprocess.run(
|
||||
[
|
||||
"single-file",
|
||||
"--browser-headless",
|
||||
TEST_URL,
|
||||
str(output_file)
|
||||
],
|
||||
['node', str(SINGLEFILE_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Archive failed: {result.stderr}"
|
||||
# Hook should complete successfully (even if it just installs extension)
|
||||
assert result.returncode == 0, f"Hook execution failed: {result.stderr}"
|
||||
|
||||
# Verify output exists
|
||||
assert output_file.exists(), "Output file not created"
|
||||
|
||||
# Read and verify content
|
||||
html_content = output_file.read_text()
|
||||
file_size = output_file.stat().st_size
|
||||
|
||||
# Should be substantial (embedded resources)
|
||||
assert file_size > 900, f"Output too small: {file_size} bytes"
|
||||
|
||||
# Verify HTML structure (SingleFile minifies, so <head> tag may be omitted)
|
||||
assert "<html" in html_content.lower()
|
||||
assert "<body" in html_content.lower()
|
||||
assert "<title>" in html_content.lower() or "title>" in html_content.lower()
|
||||
|
||||
# Verify example.com content is actually present
|
||||
assert "example domain" in html_content.lower(), "Missing 'Example Domain' title"
|
||||
assert "this domain is" in html_content.lower(), "Missing example.com description text"
|
||||
assert "iana.org" in html_content.lower(), "Missing IANA link"
|
||||
|
||||
# Verify it's not just empty/error page
|
||||
assert file_size > 900, f"File too small: {file_size} bytes"
|
||||
|
||||
|
||||
@pytest.mark.skipif(not SINGLEFILE_CLI_AVAILABLE, reason="single-file CLI not installed")
|
||||
def test_different_urls_produce_different_outputs():
|
||||
"""Verify different URLs produce different archived content"""
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
outputs = {}
|
||||
|
||||
for url in ["https://example.com", "https://example.org"]:
|
||||
output_file = Path(tmpdir) / f"{url.replace('https://', '').replace('.', '_')}.html"
|
||||
|
||||
result = subprocess.run(
|
||||
["single-file", "--browser-headless", url, str(output_file)],
|
||||
capture_output=True,
|
||||
timeout=120
|
||||
)
|
||||
|
||||
if result.returncode == 0 and output_file.exists():
|
||||
outputs[url] = output_file.read_text()
|
||||
|
||||
assert len(outputs) == 2, "Should archive both URLs"
|
||||
|
||||
# Verify outputs differ
|
||||
urls = list(outputs.keys())
|
||||
assert outputs[urls[0]] != outputs[urls[1]], "Different URLs should produce different outputs"
|
||||
|
||||
# Each should contain its domain
|
||||
assert "example.com" in outputs[urls[0]]
|
||||
assert "example.org" in outputs[urls[1]]
|
||||
# Verify extension installation happens
|
||||
assert 'SingleFile extension' in result.stdout or result.returncode == 0, "Should install extension or complete"
|
||||
|
||||
68
archivebox/plugins/wget/on_Crawl__00_install_wget.py
Executable file
68
archivebox/plugins/wget/on_Crawl__00_install_wget.py
Executable file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install wget if not already available.
|
||||
|
||||
Runs at crawl start to ensure wget is installed.
|
||||
Outputs JSONL for InstalledBinary.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
    """Ensure wget is installed; emit JSONL describing the result.

    On success prints an InstalledBinary record and exits 0.  On any failure
    (including abx_pkg being unavailable) prints a Dependency request record
    plus an error message on stderr, and exits 1.
    """
    # Fallback record emitted whenever we cannot produce an InstalledBinary.
    dependency_record = {
        'type': 'Dependency',
        'bin_name': 'wget',
        'bin_providers': 'apt,brew,env',
    }
    try:
        from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides

        # abx_pkg provider models must be rebuilt before instantiation.
        AptProvider.model_rebuild()
        BrewProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # No overrides needed: the wget binary and package share one name.
        binary = Binary(
            name='wget',
            binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
        )

        # Prefer an already-present binary; fall back to the system package manager.
        try:
            loaded = binary.load()
            if not loaded or not loaded.abspath:
                raise Exception("Not loaded")
        except Exception:
            loaded = binary.install()

        if loaded and loaded.abspath:
            # Success: report the resolved binary as an InstalledBinary record.
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'wget',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256,
                'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
            }))
            sys.exit(0)

        # Install ran but produced no usable binary: request the dependency.
        print(json.dumps(dependency_record))
        print("Failed to install wget", file=sys.stderr)
        sys.exit(1)

    except Exception as e:
        # Import or provider failure: request the dependency and report the error.
        print(json.dumps(dependency_record))
        print(f"Error installing wget: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||
@@ -26,6 +26,7 @@ import pytest
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
|
||||
WGET_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_wget.py'
|
||||
BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
|
||||
APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
@@ -36,6 +37,47 @@ def test_hook_script_exists():
|
||||
assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"
|
||||
|
||||
|
||||
def test_wget_install_hook():
|
||||
"""Test wget install hook to install wget if needed."""
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(WGET_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'wget'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify wget is available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
AptProvider.model_rebuild()
|
||||
BrewProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
wget_binary = Binary(name='wget', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||
wget_loaded = wget_binary.load()
|
||||
assert wget_loaded and wget_loaded.abspath, "wget should be available after install hook"
|
||||
|
||||
|
||||
def test_reports_missing_dependency_when_not_installed():
|
||||
"""Test that script reports DEPENDENCY_NEEDED when wget is not found."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
|
||||
@@ -63,7 +63,7 @@ CREATE INDEX IF NOT EXISTS core_snapshot_added ON core_snapshot(added);
|
||||
"""
|
||||
|
||||
SCHEMA_0_7 = """
|
||||
-- Django system tables
|
||||
-- Django system tables (complete for 0.7.x)
|
||||
CREATE TABLE IF NOT EXISTS django_migrations (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
app VARCHAR(255) NOT NULL,
|
||||
@@ -74,7 +74,28 @@ CREATE TABLE IF NOT EXISTS django_migrations (
|
||||
CREATE TABLE IF NOT EXISTS django_content_type (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
app_label VARCHAR(100) NOT NULL,
|
||||
model VARCHAR(100) NOT NULL
|
||||
model VARCHAR(100) NOT NULL,
|
||||
UNIQUE(app_label, model)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS auth_permission (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
name VARCHAR(255) NOT NULL,
|
||||
content_type_id INTEGER NOT NULL REFERENCES django_content_type(id),
|
||||
codename VARCHAR(100) NOT NULL,
|
||||
UNIQUE(content_type_id, codename)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS auth_group (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
name VARCHAR(150) NOT NULL UNIQUE
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS auth_group_permissions (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
group_id INTEGER NOT NULL REFERENCES auth_group(id),
|
||||
permission_id INTEGER NOT NULL REFERENCES auth_permission(id),
|
||||
UNIQUE(group_id, permission_id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS auth_user (
|
||||
@@ -91,6 +112,37 @@ CREATE TABLE IF NOT EXISTS auth_user (
|
||||
date_joined DATETIME NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS auth_user_groups (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
user_id INTEGER NOT NULL REFERENCES auth_user(id),
|
||||
group_id INTEGER NOT NULL REFERENCES auth_group(id),
|
||||
UNIQUE(user_id, group_id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS auth_user_user_permissions (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
user_id INTEGER NOT NULL REFERENCES auth_user(id),
|
||||
permission_id INTEGER NOT NULL REFERENCES auth_permission(id),
|
||||
UNIQUE(user_id, permission_id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS django_admin_log (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
action_time DATETIME NOT NULL,
|
||||
object_id TEXT,
|
||||
object_repr VARCHAR(200) NOT NULL,
|
||||
action_flag SMALLINT UNSIGNED NOT NULL,
|
||||
change_message TEXT NOT NULL,
|
||||
content_type_id INTEGER REFERENCES django_content_type(id),
|
||||
user_id INTEGER NOT NULL REFERENCES auth_user(id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS django_session (
|
||||
session_key VARCHAR(40) NOT NULL PRIMARY KEY,
|
||||
session_data TEXT NOT NULL,
|
||||
expire_date DATETIME NOT NULL
|
||||
);
|
||||
|
||||
-- Core tables for 0.7.x
|
||||
CREATE TABLE IF NOT EXISTS core_tag (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
@@ -120,7 +172,6 @@ CREATE TABLE IF NOT EXISTS core_snapshot_tags (
|
||||
|
||||
CREATE TABLE IF NOT EXISTS core_archiveresult (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
uuid CHAR(32) NOT NULL,
|
||||
snapshot_id CHAR(32) NOT NULL REFERENCES core_snapshot(id),
|
||||
extractor VARCHAR(32) NOT NULL,
|
||||
cmd TEXT,
|
||||
@@ -133,6 +184,18 @@ CREATE TABLE IF NOT EXISTS core_archiveresult (
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot ON core_archiveresult(snapshot_id);
|
||||
CREATE INDEX IF NOT EXISTS core_archiveresult_extractor ON core_archiveresult(extractor);
|
||||
|
||||
-- Insert required content types
|
||||
INSERT INTO django_content_type (app_label, model) VALUES
|
||||
('contenttypes', 'contenttype'),
|
||||
('auth', 'permission'),
|
||||
('auth', 'group'),
|
||||
('auth', 'user'),
|
||||
('admin', 'logentry'),
|
||||
('sessions', 'session'),
|
||||
('core', 'snapshot'),
|
||||
('core', 'archiveresult'),
|
||||
('core', 'tag');
|
||||
"""
|
||||
|
||||
|
||||
@@ -270,13 +333,13 @@ def seed_0_7_data(db_path: Path) -> Dict[str, List[Dict]]:
|
||||
statuses = ['succeeded', 'succeeded', 'failed', 'succeeded', 'skipped']
|
||||
|
||||
for j, (extractor, status) in enumerate(zip(extractors, statuses)):
|
||||
result_uuid = generate_uuid()
|
||||
# Note: uuid column is added by our migration, not present in 0.7.x
|
||||
cursor.execute("""
|
||||
INSERT INTO core_archiveresult
|
||||
(uuid, snapshot_id, extractor, cmd, pwd, cmd_version, output, start_ts, end_ts, status)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
(snapshot_id, extractor, cmd, pwd, cmd_version, output, start_ts, end_ts, status)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", (
|
||||
result_uuid, snapshot_id, extractor,
|
||||
snapshot_id, extractor,
|
||||
json.dumps([extractor, '--version']),
|
||||
f'/data/archive/{timestamp}',
|
||||
'1.0.0',
|
||||
@@ -287,14 +350,33 @@ def seed_0_7_data(db_path: Path) -> Dict[str, List[Dict]]:
|
||||
))
|
||||
|
||||
created_data['archiveresults'].append({
|
||||
'uuid': result_uuid,
|
||||
'snapshot_id': snapshot_id,
|
||||
'extractor': extractor,
|
||||
'status': status,
|
||||
})
|
||||
|
||||
# Record migrations as applied (0.7.x migrations up to 0021)
|
||||
# Record migrations as applied (0.7.x migrations up to 0022)
|
||||
migrations = [
|
||||
# Django system migrations
|
||||
('contenttypes', '0001_initial'),
|
||||
('contenttypes', '0002_remove_content_type_name'),
|
||||
('auth', '0001_initial'),
|
||||
('auth', '0002_alter_permission_name_max_length'),
|
||||
('auth', '0003_alter_user_email_max_length'),
|
||||
('auth', '0004_alter_user_username_opts'),
|
||||
('auth', '0005_alter_user_last_login_null'),
|
||||
('auth', '0006_require_contenttypes_0002'),
|
||||
('auth', '0007_alter_validators_add_error_messages'),
|
||||
('auth', '0008_alter_user_username_max_length'),
|
||||
('auth', '0009_alter_user_last_name_max_length'),
|
||||
('auth', '0010_alter_group_name_max_length'),
|
||||
('auth', '0011_update_proxy_permissions'),
|
||||
('auth', '0012_alter_user_first_name_max_length'),
|
||||
('admin', '0001_initial'),
|
||||
('admin', '0002_logentry_remove_auto_add'),
|
||||
('admin', '0003_logentry_add_action_flag_choices'),
|
||||
('sessions', '0001_initial'),
|
||||
# Core migrations
|
||||
('core', '0001_initial'),
|
||||
('core', '0002_auto_20200625_1521'),
|
||||
('core', '0003_auto_20200630_1034'),
|
||||
@@ -316,6 +398,7 @@ def seed_0_7_data(db_path: Path) -> Dict[str, List[Dict]]:
|
||||
('core', '0019_auto_20210401_0654'),
|
||||
('core', '0020_auto_20210410_1031'),
|
||||
('core', '0021_auto_20220914_0934'),
|
||||
('core', '0022_auto_20231023_2008'),
|
||||
]
|
||||
|
||||
for app, name in migrations:
|
||||
@@ -334,7 +417,7 @@ def seed_0_7_data(db_path: Path) -> Dict[str, List[Dict]]:
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
def run_archivebox(data_dir: Path, args: list, timeout: int = 120) -> subprocess.CompletedProcess:
|
||||
def run_archivebox(data_dir: Path, args: list, timeout: int = 60) -> subprocess.CompletedProcess:
|
||||
"""Run archivebox command in subprocess with given data directory."""
|
||||
env = os.environ.copy()
|
||||
env['DATA_DIR'] = str(data_dir)
|
||||
@@ -354,6 +437,7 @@ def run_archivebox(data_dir: Path, args: list, timeout: int = 120) -> subprocess
|
||||
env['SAVE_GIT'] = 'False'
|
||||
env['SAVE_MEDIA'] = 'False'
|
||||
env['SAVE_HEADERS'] = 'False'
|
||||
env['SAVE_HTMLTOTEXT'] = 'False'
|
||||
|
||||
cmd = [sys.executable, '-m', 'archivebox'] + args
|
||||
|
||||
@@ -703,12 +787,12 @@ class TestMultipleSnapshots(unittest.TestCase):
|
||||
"""Test handling multiple snapshots."""
|
||||
|
||||
def test_add_multiple_urls(self):
|
||||
"""Should be able to add multiple URLs.
|
||||
"""Should be able to add multiple URLs in a single call.
|
||||
|
||||
Each 'archivebox add' call creates:
|
||||
A single 'archivebox add' call with multiple URLs creates:
|
||||
- 1 Crawl
|
||||
- 1 Seed
|
||||
- 1 root Snapshot (file:// URL pointing to sources file)
|
||||
- Multiple URLs in the sources file -> multiple Snapshots
|
||||
"""
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
|
||||
@@ -716,23 +800,22 @@ class TestMultipleSnapshots(unittest.TestCase):
|
||||
result = run_archivebox(work_dir, ['init'])
|
||||
self.assertEqual(result.returncode, 0)
|
||||
|
||||
# Add multiple URLs (each in separate add calls)
|
||||
for url in ['https://example.com', 'https://example.org']:
|
||||
result = run_archivebox(work_dir, ['add', url], timeout=60)
|
||||
self.assertIn(result.returncode, [0, 1])
|
||||
# Add multiple URLs in single call (faster than separate calls)
|
||||
result = run_archivebox(work_dir, ['add', 'https://example.com', 'https://example.org'], timeout=60)
|
||||
self.assertIn(result.returncode, [0, 1])
|
||||
|
||||
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Verify both Crawls were created
|
||||
# Verify a Crawl was created
|
||||
cursor.execute("SELECT COUNT(*) FROM crawls_crawl")
|
||||
crawl_count = cursor.fetchone()[0]
|
||||
self.assertEqual(crawl_count, 2, f"Expected 2 Crawls, got {crawl_count}")
|
||||
self.assertGreaterEqual(crawl_count, 1, f"Expected >=1 Crawl, got {crawl_count}")
|
||||
|
||||
# Verify both root Snapshots were created
|
||||
# Verify snapshots were created (at least root snapshot + both URLs)
|
||||
cursor.execute("SELECT COUNT(*) FROM core_snapshot")
|
||||
snapshot_count = cursor.fetchone()[0]
|
||||
self.assertGreaterEqual(snapshot_count, 2, f"Expected >=2 snapshots, got {snapshot_count}")
|
||||
self.assertGreaterEqual(snapshot_count, 1, f"Expected >=1 snapshots, got {snapshot_count}")
|
||||
|
||||
conn.close()
|
||||
|
||||
|
||||
@@ -65,6 +65,7 @@ class Worker:
|
||||
|
||||
# Configuration (can be overridden by subclasses)
|
||||
MAX_TICK_TIME: ClassVar[int] = 60
|
||||
MAX_CONCURRENT_TASKS: ClassVar[int] = 1
|
||||
POLL_INTERVAL: ClassVar[float] = 0.5
|
||||
IDLE_TIMEOUT: ClassVar[int] = 3 # Exit after N idle iterations (set to 0 to never exit)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user