mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-03 14:27:55 +10:00
remove Seed model in favor of Crawl as template
This commit is contained in:
@@ -0,0 +1,113 @@
|
|||||||
|
# Generated by Django 6.0 on 2025-12-25 09:34
|
||||||
|
|
||||||
|
import django.utils.timezone
|
||||||
|
import signal_webhooks.fields
|
||||||
|
import signal_webhooks.utils
|
||||||
|
import uuid
|
||||||
|
from django.conf import settings
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
('api', '0001_squashed'),
|
||||||
|
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.AlterModelOptions(
|
||||||
|
name='outboundwebhook',
|
||||||
|
options={'verbose_name': 'API Outbound Webhook'},
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
name='created',
|
||||||
|
field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now, help_text='When the webhook was created.', verbose_name='created'),
|
||||||
|
preserve_default=False,
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
name='updated',
|
||||||
|
field=models.DateTimeField(auto_now=True, help_text='When the webhook was last updated.', verbose_name='updated'),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='apitoken',
|
||||||
|
name='created_at',
|
||||||
|
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='apitoken',
|
||||||
|
name='id',
|
||||||
|
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
name='auth_token',
|
||||||
|
field=signal_webhooks.fields.TokenField(blank=True, default='', help_text='Authentication token to use in an Authorization header.', max_length=8000, validators=[signal_webhooks.utils.decode_cipher_key], verbose_name='authentication token'),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
name='created_at',
|
||||||
|
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
name='enabled',
|
||||||
|
field=models.BooleanField(default=True, help_text='Is this webhook enabled?', verbose_name='enabled'),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
name='endpoint',
|
||||||
|
field=models.URLField(help_text='Target endpoint for this webhook.', max_length=2047, verbose_name='endpoint'),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
name='headers',
|
||||||
|
field=models.JSONField(blank=True, default=dict, help_text='Headers to send with the webhook request.', validators=[signal_webhooks.utils.is_dict], verbose_name='headers'),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
name='id',
|
||||||
|
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
name='keep_last_response',
|
||||||
|
field=models.BooleanField(default=False, help_text='Should the webhook keep a log of the latest response it got?', verbose_name='keep last response'),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
name='last_failure',
|
||||||
|
field=models.DateTimeField(default=None, help_text='When the webhook last failed.', null=True, verbose_name='last failure'),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
name='last_response',
|
||||||
|
field=models.CharField(blank=True, default='', help_text='Latest response to this webhook.', max_length=8000, verbose_name='last response'),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
name='last_success',
|
||||||
|
field=models.DateTimeField(default=None, help_text='When the webhook last succeeded.', null=True, verbose_name='last success'),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
name='name',
|
||||||
|
field=models.CharField(db_index=True, help_text='Webhook name.', max_length=255, unique=True, verbose_name='name'),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
name='ref',
|
||||||
|
field=models.CharField(db_index=True, help_text='Dot import notation to the model the webhook is for.', max_length=1023, validators=[signal_webhooks.utils.model_from_reference], verbose_name='referenced model'),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
name='signal',
|
||||||
|
field=models.CharField(choices=[('CREATE', 'Create'), ('UPDATE', 'Update'), ('DELETE', 'Delete'), ('M2M', 'M2M changed'), ('CREATE_OR_UPDATE', 'Create or Update'), ('CREATE_OR_DELETE', 'Create or Delete'), ('CREATE_OR_M2M', 'Create or M2M changed'), ('UPDATE_OR_DELETE', 'Update or Delete'), ('UPDATE_OR_M2M', 'Update or M2M changed'), ('DELETE_OR_M2M', 'Delete or M2M changed'), ('CREATE_UPDATE_OR_DELETE', 'Create, Update or Delete'), ('CREATE_UPDATE_OR_M2M', 'Create, Update or M2M changed'), ('CREATE_DELETE_OR_M2M', 'Create, Delete or M2M changed'), ('UPDATE_DELETE_OR_M2M', 'Update, Delete or M2M changed'), ('CREATE_UPDATE_DELETE_OR_M2M', 'Create, Update or Delete, or M2M changed')], help_text='Signal the webhook fires to.', max_length=255, verbose_name='signal'),
|
||||||
|
),
|
||||||
|
migrations.AddConstraint(
|
||||||
|
model_name='outboundwebhook',
|
||||||
|
constraint=models.UniqueConstraint(fields=('ref', 'endpoint'), name='prevent_duplicate_hooks_api_outboundwebhook'),
|
||||||
|
),
|
||||||
|
]
|
||||||
@@ -15,7 +15,7 @@ from ninja.pagination import paginate, PaginationBase
|
|||||||
from ninja.errors import HttpError
|
from ninja.errors import HttpError
|
||||||
|
|
||||||
from core.models import Snapshot, ArchiveResult, Tag
|
from core.models import Snapshot, ArchiveResult, Tag
|
||||||
from api.v1_crawls import CrawlSchema, SeedSchema
|
from api.v1_crawls import CrawlSchema
|
||||||
|
|
||||||
|
|
||||||
router = Router(tags=['Core Models'])
|
router = Router(tags=['Core Models'])
|
||||||
@@ -271,9 +271,9 @@ def get_tag(request, tag_id: str, with_snapshots: bool = True):
|
|||||||
return Tag.objects.get(slug__icontains=tag_id)
|
return Tag.objects.get(slug__icontains=tag_id)
|
||||||
|
|
||||||
|
|
||||||
@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID")
|
@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID")
|
||||||
def get_any(request, id: str):
|
def get_any(request, id: str):
|
||||||
"""Get any object by its ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""
|
"""Get any object by its ID (e.g. snapshot, archiveresult, tag, crawl, etc.)."""
|
||||||
request.with_snapshots = False
|
request.with_snapshots = False
|
||||||
request.with_archiveresults = False
|
request.with_archiveresults = False
|
||||||
|
|
||||||
@@ -285,14 +285,6 @@ def get_any(request, id: str):
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
|
||||||
from api.v1_crawls import get_seed
|
|
||||||
response = get_seed(request, id)
|
|
||||||
if response:
|
|
||||||
return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.id}?{request.META['QUERY_STRING']}")
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from api.v1_crawls import get_crawl
|
from api.v1_crawls import get_crawl
|
||||||
response = get_crawl(request, id)
|
response = get_crawl(request, id)
|
||||||
|
|||||||
@@ -10,53 +10,13 @@ from django.contrib.auth import get_user_model
|
|||||||
from ninja import Router, Schema
|
from ninja import Router, Schema
|
||||||
|
|
||||||
from core.models import Snapshot
|
from core.models import Snapshot
|
||||||
from crawls.models import Seed, Crawl
|
from crawls.models import Crawl
|
||||||
|
|
||||||
from .auth import API_AUTH_METHODS
|
from .auth import API_AUTH_METHODS
|
||||||
|
|
||||||
router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
|
router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
|
||||||
|
|
||||||
|
|
||||||
class SeedSchema(Schema):
|
|
||||||
TYPE: str = 'crawls.models.Seed'
|
|
||||||
|
|
||||||
id: UUID
|
|
||||||
|
|
||||||
modified_at: datetime
|
|
||||||
created_at: datetime
|
|
||||||
created_by_id: str
|
|
||||||
created_by_username: str
|
|
||||||
|
|
||||||
uri: str
|
|
||||||
tags_str: str
|
|
||||||
config: dict
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def resolve_created_by_id(obj):
|
|
||||||
return str(obj.created_by_id)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def resolve_created_by_username(obj):
|
|
||||||
User = get_user_model()
|
|
||||||
return User.objects.get(id=obj.created_by_id).username
|
|
||||||
|
|
||||||
@router.get("/seeds", response=List[SeedSchema], url_name="get_seeds")
|
|
||||||
def get_seeds(request):
|
|
||||||
return Seed.objects.all().distinct()
|
|
||||||
|
|
||||||
@router.get("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
|
|
||||||
def get_seed(request, seed_id: str):
|
|
||||||
seed = None
|
|
||||||
request.with_snapshots = False
|
|
||||||
request.with_archiveresults = False
|
|
||||||
|
|
||||||
try:
|
|
||||||
seed = Seed.objects.get(Q(id__icontains=seed_id))
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return seed
|
|
||||||
|
|
||||||
|
|
||||||
class CrawlSchema(Schema):
|
class CrawlSchema(Schema):
|
||||||
TYPE: str = 'crawls.models.Crawl'
|
TYPE: str = 'crawls.models.Crawl'
|
||||||
|
|
||||||
@@ -70,8 +30,11 @@ class CrawlSchema(Schema):
|
|||||||
status: str
|
status: str
|
||||||
retry_at: datetime | None
|
retry_at: datetime | None
|
||||||
|
|
||||||
seed: SeedSchema
|
urls: str
|
||||||
|
extractor: str
|
||||||
max_depth: int
|
max_depth: int
|
||||||
|
tags_str: str
|
||||||
|
config: dict
|
||||||
|
|
||||||
# snapshots: List[SnapshotSchema]
|
# snapshots: List[SnapshotSchema]
|
||||||
|
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ class ArchiveBoxGroup(click.Group):
|
|||||||
meta_commands = {
|
meta_commands = {
|
||||||
'help': 'archivebox.cli.archivebox_help.main',
|
'help': 'archivebox.cli.archivebox_help.main',
|
||||||
'version': 'archivebox.cli.archivebox_version.main',
|
'version': 'archivebox.cli.archivebox_version.main',
|
||||||
|
'mcp': 'archivebox.cli.archivebox_mcp.main',
|
||||||
}
|
}
|
||||||
setup_commands = {
|
setup_commands = {
|
||||||
'init': 'archivebox.cli.archivebox_init.main',
|
'init': 'archivebox.cli.archivebox_init.main',
|
||||||
|
|||||||
@@ -36,15 +36,14 @@ def add(urls: str | list[str],
|
|||||||
created_by_id: int | None=None) -> QuerySet['Snapshot']:
|
created_by_id: int | None=None) -> QuerySet['Snapshot']:
|
||||||
"""Add a new URL or list of URLs to your archive.
|
"""Add a new URL or list of URLs to your archive.
|
||||||
|
|
||||||
The new flow is:
|
The flow is:
|
||||||
1. Save URLs to sources file
|
1. Save URLs to sources file
|
||||||
2. Create Seed pointing to the file
|
2. Create Crawl with URLs and max_depth
|
||||||
3. Create Crawl with max_depth
|
3. Orchestrator creates Snapshots from Crawl URLs (depth=0)
|
||||||
4. Create root Snapshot pointing to file:// URL (depth=0)
|
4. Orchestrator runs parser extractors on root snapshots
|
||||||
5. Orchestrator runs parser extractors on root snapshot
|
5. Parser extractors output to urls.jsonl
|
||||||
6. Parser extractors output to urls.jsonl
|
6. URLs are added to Crawl.urls and child Snapshots are created
|
||||||
7. URLs are added to Crawl.urls and child Snapshots are created
|
7. Repeat until max_depth is reached
|
||||||
8. Repeat until max_depth is reached
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from rich import print
|
from rich import print
|
||||||
@@ -55,7 +54,7 @@ def add(urls: str | list[str],
|
|||||||
|
|
||||||
# import models once django is set up
|
# import models once django is set up
|
||||||
from core.models import Snapshot
|
from core.models import Snapshot
|
||||||
from crawls.models import Seed, Crawl
|
from crawls.models import Crawl
|
||||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||||
from workers.orchestrator import Orchestrator
|
from workers.orchestrator import Orchestrator
|
||||||
|
|
||||||
@@ -66,19 +65,24 @@ def add(urls: str | list[str],
|
|||||||
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
||||||
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
||||||
|
|
||||||
# 2. Create a new Seed pointing to the sources file
|
# 2. Create a new Crawl with inline URLs
|
||||||
cli_args = [*sys.argv]
|
cli_args = [*sys.argv]
|
||||||
if cli_args[0].lower().endswith('archivebox'):
|
if cli_args[0].lower().endswith('archivebox'):
|
||||||
cli_args[0] = 'archivebox'
|
cli_args[0] = 'archivebox'
|
||||||
cmd_str = ' '.join(cli_args)
|
cmd_str = ' '.join(cli_args)
|
||||||
|
|
||||||
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||||
seed = Seed.from_file(
|
|
||||||
sources_file,
|
# Read URLs directly into crawl
|
||||||
|
urls_content = sources_file.read_text()
|
||||||
|
|
||||||
|
crawl = Crawl.objects.create(
|
||||||
|
urls=urls_content,
|
||||||
|
extractor=parser,
|
||||||
|
max_depth=depth,
|
||||||
|
tags_str=tag,
|
||||||
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
|
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
|
||||||
parser=parser,
|
created_by_id=created_by_id,
|
||||||
tag=tag,
|
|
||||||
created_by=created_by_id,
|
|
||||||
config={
|
config={
|
||||||
'ONLY_NEW': not update,
|
'ONLY_NEW': not update,
|
||||||
'INDEX_ONLY': index_only,
|
'INDEX_ONLY': index_only,
|
||||||
@@ -88,15 +92,13 @@ def add(urls: str | list[str],
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# 3. Create a new Crawl pointing to the Seed (status=queued)
|
|
||||||
crawl = Crawl.from_seed(seed, max_depth=depth)
|
|
||||||
|
|
||||||
print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
|
print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
|
||||||
print(f' [dim]Seed: {seed.uri}[/dim]')
|
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
|
||||||
|
print(f' [dim]First URL: {first_url}[/dim]')
|
||||||
|
|
||||||
# 4. The CrawlMachine will create the root Snapshot when started
|
# 3. The CrawlMachine will create the root Snapshot when started
|
||||||
# Root snapshot URL = file:///path/to/sources/...txt
|
# If URLs are from a file: first URL = file:///path/to/sources/...txt
|
||||||
# Parser extractors will run on it and discover URLs
|
# Parser extractors will run on it and discover more URLs
|
||||||
# Those URLs become child Snapshots (depth=1)
|
# Those URLs become child Snapshots (depth=1)
|
||||||
|
|
||||||
if index_only:
|
if index_only:
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ def discover_outlinks(
|
|||||||
)
|
)
|
||||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||||
from core.models import Snapshot, ArchiveResult
|
from core.models import Snapshot, ArchiveResult
|
||||||
from crawls.models import Seed, Crawl
|
from crawls.models import Crawl
|
||||||
from archivebox.config import CONSTANTS
|
from archivebox.config import CONSTANTS
|
||||||
from workers.orchestrator import Orchestrator
|
from workers.orchestrator import Orchestrator
|
||||||
|
|
||||||
@@ -117,12 +117,12 @@ def discover_outlinks(
|
|||||||
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
||||||
sources_file.write_text('\n'.join(r.get('url', '') for r in new_url_records if r.get('url')))
|
sources_file.write_text('\n'.join(r.get('url', '') for r in new_url_records if r.get('url')))
|
||||||
|
|
||||||
seed = Seed.from_file(
|
crawl = Crawl.from_file(
|
||||||
sources_file,
|
sources_file,
|
||||||
|
max_depth=depth,
|
||||||
label=f'crawl --depth={depth}',
|
label=f'crawl --depth={depth}',
|
||||||
created_by=created_by_id,
|
created_by=created_by_id,
|
||||||
)
|
)
|
||||||
crawl = Crawl.from_seed(seed, max_depth=depth)
|
|
||||||
|
|
||||||
# Create snapshots for new URLs
|
# Create snapshots for new URLs
|
||||||
for record in new_url_records:
|
for record in new_url_records:
|
||||||
|
|||||||
@@ -42,27 +42,20 @@ def install(dry_run: bool=False) -> None:
|
|||||||
setup_django()
|
setup_django()
|
||||||
|
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
from crawls.models import Seed, Crawl
|
from crawls.models import Crawl
|
||||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||||
|
|
||||||
# Create a seed and crawl for dependency detection
|
# Create a crawl for dependency detection
|
||||||
# Using a minimal crawl that will trigger on_Crawl hooks
|
# Using a minimal crawl that will trigger on_Crawl hooks
|
||||||
created_by_id = get_or_create_system_user_pk()
|
created_by_id = get_or_create_system_user_pk()
|
||||||
|
|
||||||
seed, _created = Seed.objects.get_or_create(
|
crawl, created = Crawl.objects.get_or_create(
|
||||||
uri='archivebox://install',
|
urls='archivebox://install',
|
||||||
label='Dependency detection',
|
label='Dependency detection',
|
||||||
created_by_id=created_by_id,
|
created_by_id=created_by_id,
|
||||||
defaults={
|
defaults={
|
||||||
'extractor': 'auto',
|
'extractor': 'auto',
|
||||||
}
|
'max_depth': 0,
|
||||||
)
|
|
||||||
|
|
||||||
crawl, created = Crawl.objects.get_or_create(
|
|
||||||
seed=seed,
|
|
||||||
max_depth=0,
|
|
||||||
created_by_id=created_by_id,
|
|
||||||
defaults={
|
|
||||||
'status': 'queued',
|
'status': 'queued',
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -92,7 +92,7 @@ def create_snapshots(
|
|||||||
)
|
)
|
||||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||||
from core.models import Snapshot
|
from core.models import Snapshot
|
||||||
from crawls.models import Seed, Crawl
|
from crawls.models import Crawl
|
||||||
from archivebox.config import CONSTANTS
|
from archivebox.config import CONSTANTS
|
||||||
|
|
||||||
created_by_id = created_by_id or get_or_create_system_user_pk()
|
created_by_id = created_by_id or get_or_create_system_user_pk()
|
||||||
@@ -108,17 +108,17 @@ def create_snapshots(
|
|||||||
# If depth > 0, we need a Crawl to manage recursive discovery
|
# If depth > 0, we need a Crawl to manage recursive discovery
|
||||||
crawl = None
|
crawl = None
|
||||||
if depth > 0:
|
if depth > 0:
|
||||||
# Create a seed for this batch
|
# Create a crawl for this batch
|
||||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__snapshot.txt'
|
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__snapshot.txt'
|
||||||
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
||||||
sources_file.write_text('\n'.join(r.get('url', '') for r in records if r.get('url')))
|
sources_file.write_text('\n'.join(r.get('url', '') for r in records if r.get('url')))
|
||||||
|
|
||||||
seed = Seed.from_file(
|
crawl = Crawl.from_file(
|
||||||
sources_file,
|
sources_file,
|
||||||
|
max_depth=depth,
|
||||||
label=f'snapshot --depth={depth}',
|
label=f'snapshot --depth={depth}',
|
||||||
created_by=created_by_id,
|
created_by=created_by_id,
|
||||||
)
|
)
|
||||||
crawl = Crawl.from_seed(seed, max_depth=depth)
|
|
||||||
|
|
||||||
# Process each record
|
# Process each record
|
||||||
created_snapshots = []
|
created_snapshots = []
|
||||||
|
|||||||
@@ -111,53 +111,27 @@ def version(quiet: bool=False,
|
|||||||
|
|
||||||
machine = Machine.current()
|
machine = Machine.current()
|
||||||
|
|
||||||
# Get all *_BINARY config values
|
# Get all installed binaries from the database
|
||||||
binary_config_keys = [key for key in config.keys() if key.endswith('_BINARY')]
|
all_installed = InstalledBinary.objects.filter(
|
||||||
|
machine=machine
|
||||||
|
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
|
||||||
|
|
||||||
if not binary_config_keys:
|
if not all_installed.exists():
|
||||||
prnt('', '[grey53]No binary dependencies defined in config.[/grey53]')
|
prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
|
||||||
else:
|
else:
|
||||||
for key in sorted(set(binary_config_keys)):
|
for installed in all_installed:
|
||||||
# Get the actual binary name/path from config value
|
# Skip if user specified specific binaries and this isn't one
|
||||||
# Prioritize Machine.config overrides over base config
|
if binaries and installed.name not in binaries:
|
||||||
bin_value = machine.config.get(key) or config.get(key, '').strip()
|
|
||||||
if not bin_value:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check if it's a path (has slashes) or just a name
|
if installed.is_valid:
|
||||||
is_path = '/' in str(bin_value)
|
|
||||||
|
|
||||||
if is_path:
|
|
||||||
# It's a full path - match against abspath
|
|
||||||
bin_name = Path(bin_value).name
|
|
||||||
# Skip if user specified specific binaries and this isn't one
|
|
||||||
if binaries and bin_name not in binaries:
|
|
||||||
continue
|
|
||||||
# Find InstalledBinary where abspath ends with this path
|
|
||||||
installed = InstalledBinary.objects.filter(
|
|
||||||
machine=machine,
|
|
||||||
abspath__endswith=bin_value,
|
|
||||||
).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
|
|
||||||
else:
|
|
||||||
# It's just a binary name - match against name
|
|
||||||
bin_name = bin_value
|
|
||||||
# Skip if user specified specific binaries and this isn't one
|
|
||||||
if binaries and bin_name not in binaries:
|
|
||||||
continue
|
|
||||||
# Find InstalledBinary by name
|
|
||||||
installed = InstalledBinary.objects.filter(
|
|
||||||
machine=machine,
|
|
||||||
name__iexact=bin_name,
|
|
||||||
).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
|
|
||||||
|
|
||||||
if installed and installed.is_valid:
|
|
||||||
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
|
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
|
||||||
version_str = (installed.version or 'unknown')[:15]
|
version_str = (installed.version or 'unknown')[:15]
|
||||||
provider = (installed.binprovider or 'env')[:8]
|
provider = (installed.binprovider or 'env')[:8]
|
||||||
prnt('', '[green]√[/green]', '', bin_name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
|
prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
|
||||||
else:
|
else:
|
||||||
prnt('', '[red]X[/red]', '', bin_name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
|
prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
|
||||||
failures.append(bin_name)
|
failures.append(installed.name)
|
||||||
|
|
||||||
# Show hint if no binaries are installed yet
|
# Show hint if no binaries are installed yet
|
||||||
has_any_installed = InstalledBinary.objects.filter(machine=machine).exclude(abspath='').exists()
|
has_any_installed = InstalledBinary.objects.filter(machine=machine).exclude(abspath='').exists()
|
||||||
|
|||||||
@@ -96,10 +96,8 @@ class ConstantsDict(Mapping):
|
|||||||
# Data dir files
|
# Data dir files
|
||||||
CONFIG_FILENAME: str = 'ArchiveBox.conf'
|
CONFIG_FILENAME: str = 'ArchiveBox.conf'
|
||||||
SQL_INDEX_FILENAME: str = 'index.sqlite3'
|
SQL_INDEX_FILENAME: str = 'index.sqlite3'
|
||||||
QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'
|
|
||||||
CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
|
CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
|
||||||
DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
|
DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
|
||||||
QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME
|
|
||||||
|
|
||||||
JSON_INDEX_FILENAME: str = 'index.json'
|
JSON_INDEX_FILENAME: str = 'index.json'
|
||||||
HTML_INDEX_FILENAME: str = 'index.html'
|
HTML_INDEX_FILENAME: str = 'index.html'
|
||||||
@@ -184,10 +182,10 @@ class ConstantsDict(Mapping):
|
|||||||
SQL_INDEX_FILENAME,
|
SQL_INDEX_FILENAME,
|
||||||
f"{SQL_INDEX_FILENAME}-wal",
|
f"{SQL_INDEX_FILENAME}-wal",
|
||||||
f"{SQL_INDEX_FILENAME}-shm",
|
f"{SQL_INDEX_FILENAME}-shm",
|
||||||
QUEUE_DATABASE_FILENAME,
|
|
||||||
f"{QUEUE_DATABASE_FILENAME}-wal",
|
|
||||||
f"{QUEUE_DATABASE_FILENAME}-shm",
|
|
||||||
"search.sqlite3",
|
"search.sqlite3",
|
||||||
|
"queue.sqlite3",
|
||||||
|
"queue.sqlite3-wal",
|
||||||
|
"queue.sqlite3-shm",
|
||||||
JSON_INDEX_FILENAME,
|
JSON_INDEX_FILENAME,
|
||||||
HTML_INDEX_FILENAME,
|
HTML_INDEX_FILENAME,
|
||||||
ROBOTS_TXT_FILENAME,
|
ROBOTS_TXT_FILENAME,
|
||||||
|
|||||||
@@ -56,6 +56,14 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
|
|||||||
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
|
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
|
||||||
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
|
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
|
||||||
|
|
||||||
|
# Suppress the "database access during app initialization" warning
|
||||||
|
# This warning can be triggered during django.setup() but is safe to ignore
|
||||||
|
# since we're doing intentional setup operations
|
||||||
|
import warnings
|
||||||
|
warnings.filterwarnings('ignore',
|
||||||
|
message='.*Accessing the database during app initialization.*',
|
||||||
|
category=RuntimeWarning)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from django.core.management import call_command
|
from django.core.management import call_command
|
||||||
|
|
||||||
@@ -87,7 +95,8 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
|
|||||||
style='bold red',
|
style='bold red',
|
||||||
))
|
))
|
||||||
STDERR.print()
|
STDERR.print()
|
||||||
STDERR.print_exception(show_locals=False)
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
return
|
return
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|||||||
@@ -224,12 +224,6 @@ def get_data_locations():
|
|||||||
"is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
|
"is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
|
||||||
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
|
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
|
||||||
},
|
},
|
||||||
"QUEUE_DATABASE": {
|
|
||||||
"path": CONSTANTS.QUEUE_DATABASE_FILE,
|
|
||||||
"enabled": True,
|
|
||||||
"is_valid": os.path.isfile(CONSTANTS.QUEUE_DATABASE_FILE) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.R_OK) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.W_OK),
|
|
||||||
"is_mount": os.path.ismount(CONSTANTS.QUEUE_DATABASE_FILE),
|
|
||||||
},
|
|
||||||
"ARCHIVE_DIR": {
|
"ARCHIVE_DIR": {
|
||||||
"path": ARCHIVE_DIR.resolve(),
|
"path": ARCHIVE_DIR.resolve(),
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
|
|||||||
@@ -33,15 +33,18 @@ GLOBAL_CONTEXT = {}
|
|||||||
|
|
||||||
|
|
||||||
class SnapshotActionForm(ActionForm):
|
class SnapshotActionForm(ActionForm):
|
||||||
tags = forms.ModelMultipleChoiceField(
|
def __init__(self, *args, **kwargs):
|
||||||
label='Edit tags',
|
super().__init__(*args, **kwargs)
|
||||||
queryset=Tag.objects.all(),
|
# Define tags field in __init__ to avoid database access during app initialization
|
||||||
required=False,
|
self.fields['tags'] = forms.ModelMultipleChoiceField(
|
||||||
widget=FilteredSelectMultiple(
|
label='Edit tags',
|
||||||
'core_tag__name',
|
queryset=Tag.objects.all(),
|
||||||
False,
|
required=False,
|
||||||
),
|
widget=FilteredSelectMultiple(
|
||||||
)
|
'core_tag__name',
|
||||||
|
False,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
# TODO: allow selecting actions for specific extractors? is this useful?
|
# TODO: allow selecting actions for specific extractors? is this useful?
|
||||||
# extractor = forms.ChoiceField(
|
# extractor = forms.ChoiceField(
|
||||||
@@ -165,14 +168,69 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
|||||||
|
|
||||||
def admin_actions(self, obj):
|
def admin_actions(self, obj):
|
||||||
return format_html(
|
return format_html(
|
||||||
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
|
|
||||||
'''
|
'''
|
||||||
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}">Summary page ➡️</a>
|
<div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: center;">
|
||||||
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}/index.html#all">Result files 📑</a>
|
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||||
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/admin/core/snapshot/?id__exact={}">Admin actions ⚙️</a>
|
href="/archive/{}"
|
||||||
|
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||||
|
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||||
|
📄 Summary Page
|
||||||
|
</a>
|
||||||
|
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||||
|
href="/archive/{}/index.html#all"
|
||||||
|
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||||
|
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||||
|
📁 Result Files
|
||||||
|
</a>
|
||||||
|
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||||
|
href="{}"
|
||||||
|
target="_blank"
|
||||||
|
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||||
|
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||||
|
🔗 Original URL
|
||||||
|
</a>
|
||||||
|
|
||||||
|
<span style="border-left: 1px solid #e2e8f0; height: 24px; margin: 0 4px;"></span>
|
||||||
|
|
||||||
|
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||||
|
href="/admin/core/snapshot/?id__exact={}"
|
||||||
|
title="Get missing extractors"
|
||||||
|
onmouseover="this.style.background='#d1fae5';"
|
||||||
|
onmouseout="this.style.background='#ecfdf5';">
|
||||||
|
⬇️ Get Missing
|
||||||
|
</a>
|
||||||
|
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #eff6ff; border: 1px solid #bfdbfe; border-radius: 8px; color: #1e40af; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||||
|
href="/admin/core/snapshot/?id__exact={}"
|
||||||
|
title="Create a fresh new snapshot of this URL"
|
||||||
|
onmouseover="this.style.background='#dbeafe';"
|
||||||
|
onmouseout="this.style.background='#eff6ff';">
|
||||||
|
🆕 Archive Again
|
||||||
|
</a>
|
||||||
|
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fffbeb; border: 1px solid #fde68a; border-radius: 8px; color: #92400e; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||||
|
href="/admin/core/snapshot/?id__exact={}"
|
||||||
|
title="Re-run all extractors (overwrite existing)"
|
||||||
|
onmouseover="this.style.background='#fef3c7';"
|
||||||
|
onmouseout="this.style.background='#fffbeb';">
|
||||||
|
🔄 Redo All
|
||||||
|
</a>
|
||||||
|
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fef2f2; border: 1px solid #fecaca; border-radius: 8px; color: #991b1b; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||||
|
href="/admin/core/snapshot/?id__exact={}"
|
||||||
|
title="Permanently delete this snapshot"
|
||||||
|
onmouseover="this.style.background='#fee2e2';"
|
||||||
|
onmouseout="this.style.background='#fef2f2';">
|
||||||
|
☠️ Delete
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
<p style="margin-top: 12px; font-size: 12px; color: #64748b;">
|
||||||
|
<b>Tip:</b> Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
|
||||||
|
</p>
|
||||||
''',
|
''',
|
||||||
obj.timestamp,
|
obj.timestamp,
|
||||||
obj.timestamp,
|
obj.timestamp,
|
||||||
|
obj.url,
|
||||||
|
obj.pk,
|
||||||
|
obj.pk,
|
||||||
|
obj.pk,
|
||||||
obj.pk,
|
obj.pk,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,101 @@
|
|||||||
|
# Generated by Django 6.0 on 2025-12-25 09:34
|
||||||
|
|
||||||
|
import archivebox.base_models.models
|
||||||
|
import django.db.models.deletion
|
||||||
|
import django.utils.timezone
|
||||||
|
import uuid
|
||||||
|
from django.conf import settings
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
('core', '0025_allow_duplicate_urls_per_crawl'),
|
||||||
|
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.RemoveField(
|
||||||
|
model_name='archiveresult',
|
||||||
|
name='output_dir',
|
||||||
|
),
|
||||||
|
migrations.RemoveField(
|
||||||
|
model_name='snapshot',
|
||||||
|
name='output_dir',
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='archiveresult',
|
||||||
|
name='created_at',
|
||||||
|
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='archiveresult',
|
||||||
|
name='created_by',
|
||||||
|
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='archiveresult',
|
||||||
|
name='extractor',
|
||||||
|
field=models.CharField(db_index=True, max_length=32),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='archiveresult',
|
||||||
|
name='id',
|
||||||
|
field=models.AutoField(editable=False, primary_key=True, serialize=False),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='archiveresult',
|
||||||
|
name='status',
|
||||||
|
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='archiveresult',
|
||||||
|
name='uuid',
|
||||||
|
field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True, unique=True),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='snapshot',
|
||||||
|
name='bookmarked_at',
|
||||||
|
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='snapshot',
|
||||||
|
name='created_at',
|
||||||
|
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='snapshot',
|
||||||
|
name='created_by',
|
||||||
|
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='snapshot',
|
||||||
|
name='downloaded_at',
|
||||||
|
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='snapshot',
|
||||||
|
name='id',
|
||||||
|
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||||
|
),
|
||||||
|
# migrations.AlterField(
|
||||||
|
# model_name='snapshot',
|
||||||
|
# name='tags',
|
||||||
|
# field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
|
||||||
|
# ),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='snapshottag',
|
||||||
|
name='id',
|
||||||
|
field=models.AutoField(primary_key=True, serialize=False),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='tag',
|
||||||
|
name='created_by',
|
||||||
|
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
|
||||||
|
),
|
||||||
|
migrations.AlterUniqueTogether(
|
||||||
|
name='snapshottag',
|
||||||
|
unique_together={('snapshot', 'tag')},
|
||||||
|
),
|
||||||
|
]
|
||||||
@@ -59,7 +59,7 @@ INSTALLED_APPS = [
|
|||||||
"config", # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
|
"config", # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
|
||||||
"machine", # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
|
"machine", # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
|
||||||
"workers", # handles starting and managing background workers and processes (orchestrators and actors)
|
"workers", # handles starting and managing background workers and processes (orchestrators and actors)
|
||||||
"crawls", # handles Seed, Crawl, and CrawlSchedule models and management
|
"crawls", # handles Crawl and CrawlSchedule models and management
|
||||||
"personas", # handles Persona and session management
|
"personas", # handles Persona and session management
|
||||||
"core", # core django model with Snapshot, ArchiveResult, etc.
|
"core", # core django model with Snapshot, ArchiveResult, etc.
|
||||||
"api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
|
"api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
|
||||||
@@ -194,10 +194,6 @@ DATABASES = {
|
|||||||
"NAME": DATABASE_NAME,
|
"NAME": DATABASE_NAME,
|
||||||
**SQLITE_CONNECTION_OPTIONS,
|
**SQLITE_CONNECTION_OPTIONS,
|
||||||
},
|
},
|
||||||
"queue": {
|
|
||||||
"NAME": CONSTANTS.QUEUE_DATABASE_FILE,
|
|
||||||
**SQLITE_CONNECTION_OPTIONS,
|
|
||||||
},
|
|
||||||
# "filestore": {
|
# "filestore": {
|
||||||
# "NAME": CONSTANTS.FILESTORE_DATABASE_FILE,
|
# "NAME": CONSTANTS.FILESTORE_DATABASE_FILE,
|
||||||
# **SQLITE_CONNECTION_OPTIONS,
|
# **SQLITE_CONNECTION_OPTIONS,
|
||||||
|
|||||||
@@ -2,8 +2,6 @@ __package__ = 'archivebox.core'
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import shutil
|
|
||||||
import tempfile
|
import tempfile
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
@@ -11,7 +9,6 @@ import pydantic
|
|||||||
import django.template
|
import django.template
|
||||||
|
|
||||||
from archivebox.config import CONSTANTS
|
from archivebox.config import CONSTANTS
|
||||||
from archivebox.misc.logging import IS_TTY
|
|
||||||
|
|
||||||
|
|
||||||
IGNORABLE_URL_PATTERNS = [
|
IGNORABLE_URL_PATTERNS = [
|
||||||
@@ -79,7 +76,6 @@ SETTINGS_LOGGING = {
|
|||||||
"formatters": {
|
"formatters": {
|
||||||
"rich": {
|
"rich": {
|
||||||
"datefmt": "[%Y-%m-%d %H:%M:%S]",
|
"datefmt": "[%Y-%m-%d %H:%M:%S]",
|
||||||
# "format": "{asctime} {levelname} {module} {name} {message} {username}",
|
|
||||||
"format": "%(name)s %(message)s",
|
"format": "%(name)s %(message)s",
|
||||||
},
|
},
|
||||||
"outbound_webhooks": {
|
"outbound_webhooks": {
|
||||||
@@ -99,26 +95,13 @@ SETTINGS_LOGGING = {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
"handlers": {
|
"handlers": {
|
||||||
# "console": {
|
|
||||||
# "level": "DEBUG",
|
|
||||||
# 'formatter': 'simple',
|
|
||||||
# "class": "logging.StreamHandler",
|
|
||||||
# 'filters': ['noisyrequestsfilter', 'add_extra_logging_attrs'],
|
|
||||||
# },
|
|
||||||
"default": {
|
"default": {
|
||||||
"class": "rich.logging.RichHandler",
|
"class": "rich.logging.RichHandler",
|
||||||
"formatter": "rich",
|
"formatter": "rich",
|
||||||
"level": "DEBUG",
|
"level": "DEBUG",
|
||||||
"markup": False,
|
"markup": False,
|
||||||
"rich_tracebacks": IS_TTY,
|
"rich_tracebacks": False, # Use standard Python tracebacks (no frame/box)
|
||||||
"filters": ["noisyrequestsfilter"],
|
"filters": ["noisyrequestsfilter"],
|
||||||
"tracebacks_suppress": [
|
|
||||||
django,
|
|
||||||
pydantic,
|
|
||||||
],
|
|
||||||
"tracebacks_width": shutil.get_terminal_size((100, 10)).columns - 1,
|
|
||||||
"tracebacks_word_wrap": False,
|
|
||||||
"tracebacks_show_locals": False,
|
|
||||||
},
|
},
|
||||||
"logfile": {
|
"logfile": {
|
||||||
"level": "INFO",
|
"level": "INFO",
|
||||||
@@ -132,7 +115,7 @@ SETTINGS_LOGGING = {
|
|||||||
"outbound_webhooks": {
|
"outbound_webhooks": {
|
||||||
"class": "rich.logging.RichHandler",
|
"class": "rich.logging.RichHandler",
|
||||||
"markup": False,
|
"markup": False,
|
||||||
"rich_tracebacks": True,
|
"rich_tracebacks": False, # Use standard Python tracebacks (no frame/box)
|
||||||
"formatter": "outbound_webhooks",
|
"formatter": "outbound_webhooks",
|
||||||
},
|
},
|
||||||
# "mail_admins": {
|
# "mail_admins": {
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ from statemachine import State, StateMachine
|
|||||||
# from workers.actor import ActorType
|
# from workers.actor import ActorType
|
||||||
|
|
||||||
from core.models import Snapshot, ArchiveResult
|
from core.models import Snapshot, ArchiveResult
|
||||||
from crawls.models import Crawl, Seed
|
from crawls.models import Crawl
|
||||||
|
|
||||||
|
|
||||||
class SnapshotMachine(StateMachine, strict_states=True):
|
class SnapshotMachine(StateMachine, strict_states=True):
|
||||||
@@ -247,17 +247,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
|||||||
)
|
)
|
||||||
self.archiveresult.save(write_indexes=True)
|
self.archiveresult.save(write_indexes=True)
|
||||||
|
|
||||||
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed
|
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
|
||||||
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
||||||
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
||||||
|
|
||||||
# Also update Crawl and Seed health stats if snapshot has a crawl
|
# Also update Crawl health stats if snapshot has a crawl
|
||||||
snapshot = self.archiveresult.snapshot
|
snapshot = self.archiveresult.snapshot
|
||||||
if snapshot.crawl_id:
|
if snapshot.crawl_id:
|
||||||
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
||||||
crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first()
|
|
||||||
if crawl:
|
|
||||||
Seed.objects.filter(pk=crawl).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
|
||||||
|
|
||||||
@failed.enter
|
@failed.enter
|
||||||
def enter_failed(self):
|
def enter_failed(self):
|
||||||
@@ -268,17 +265,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
|||||||
end_ts=timezone.now(),
|
end_ts=timezone.now(),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed
|
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
|
||||||
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_failed=F('num_uses_failed') + 1)
|
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_failed=F('num_uses_failed') + 1)
|
||||||
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_failed=F('num_uses_failed') + 1)
|
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_failed=F('num_uses_failed') + 1)
|
||||||
|
|
||||||
# Also update Crawl and Seed health stats if snapshot has a crawl
|
# Also update Crawl health stats if snapshot has a crawl
|
||||||
snapshot = self.archiveresult.snapshot
|
snapshot = self.archiveresult.snapshot
|
||||||
if snapshot.crawl_id:
|
if snapshot.crawl_id:
|
||||||
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_failed=F('num_uses_failed') + 1)
|
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_failed=F('num_uses_failed') + 1)
|
||||||
crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first()
|
|
||||||
if crawl:
|
|
||||||
Seed.objects.filter(pk=crawl).update(num_uses_failed=F('num_uses_failed') + 1)
|
|
||||||
|
|
||||||
@skipped.enter
|
@skipped.enter
|
||||||
def enter_skipped(self):
|
def enter_skipped(self):
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ from archivebox.search import query_search_index
|
|||||||
|
|
||||||
from core.models import Snapshot
|
from core.models import Snapshot
|
||||||
from core.forms import AddLinkForm
|
from core.forms import AddLinkForm
|
||||||
from crawls.models import Seed, Crawl
|
from crawls.models import Crawl
|
||||||
from archivebox.hooks import get_extractors, get_extractor_name
|
from archivebox.hooks import get_extractors, get_extractor_name
|
||||||
|
|
||||||
|
|
||||||
@@ -119,7 +119,11 @@ class SnapshotView(View):
|
|||||||
if result_file.name in existing_files or result_file.name == 'index.html':
|
if result_file.name in existing_files or result_file.name == 'index.html':
|
||||||
continue
|
continue
|
||||||
|
|
||||||
file_size = result_file.stat().st_size or 0
|
# Skip circular symlinks and other stat() failures
|
||||||
|
try:
|
||||||
|
file_size = result_file.stat().st_size or 0
|
||||||
|
except OSError:
|
||||||
|
continue
|
||||||
|
|
||||||
if file_size > min_size_threshold:
|
if file_size > min_size_threshold:
|
||||||
archiveresults[result_file.name] = {
|
archiveresults[result_file.name] = {
|
||||||
@@ -471,14 +475,16 @@ class AddView(UserPassesTestMixin, FormView):
|
|||||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
|
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
|
||||||
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
||||||
|
|
||||||
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
|
# 2. create a new Crawl with the URLs from the file
|
||||||
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||||
seed = Seed.from_file(
|
urls_content = sources_file.read_text()
|
||||||
sources_file,
|
crawl = Crawl.objects.create(
|
||||||
|
urls=urls_content,
|
||||||
|
extractor=parser,
|
||||||
|
max_depth=depth,
|
||||||
|
tags_str=tag,
|
||||||
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
|
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
|
||||||
parser=parser,
|
created_by_id=self.request.user.pk,
|
||||||
tag=tag,
|
|
||||||
created_by=self.request.user.pk,
|
|
||||||
config={
|
config={
|
||||||
# 'ONLY_NEW': not update,
|
# 'ONLY_NEW': not update,
|
||||||
# 'INDEX_ONLY': index_only,
|
# 'INDEX_ONLY': index_only,
|
||||||
@@ -486,9 +492,8 @@ class AddView(UserPassesTestMixin, FormView):
|
|||||||
'DEPTH': depth,
|
'DEPTH': depth,
|
||||||
'EXTRACTORS': extractors or '',
|
'EXTRACTORS': extractors or '',
|
||||||
# 'DEFAULT_PERSONA': persona or 'Default',
|
# 'DEFAULT_PERSONA': persona or 'Default',
|
||||||
})
|
}
|
||||||
# 3. create a new Crawl pointing to the Seed
|
)
|
||||||
crawl = Crawl.from_seed(seed, max_depth=depth)
|
|
||||||
|
|
||||||
# 4. start the Orchestrator & wait until it completes
|
# 4. start the Orchestrator & wait until it completes
|
||||||
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
|
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
|
||||||
@@ -569,19 +574,7 @@ def live_progress_view(request):
|
|||||||
# Count URLs in the crawl (for when snapshots haven't been created yet)
|
# Count URLs in the crawl (for when snapshots haven't been created yet)
|
||||||
urls_count = 0
|
urls_count = 0
|
||||||
if crawl.urls:
|
if crawl.urls:
|
||||||
urls_count = len([u for u in crawl.urls.split('\n') if u.strip()])
|
urls_count = len([u for u in crawl.urls.split('\n') if u.strip() and not u.startswith('#')])
|
||||||
elif crawl.seed and crawl.seed.uri:
|
|
||||||
# Try to get URL count from seed
|
|
||||||
if crawl.seed.uri.startswith('file:///'):
|
|
||||||
try:
|
|
||||||
from pathlib import Path
|
|
||||||
seed_file = Path(crawl.seed.uri.replace('file://', ''))
|
|
||||||
if seed_file.exists():
|
|
||||||
urls_count = len([l for l in seed_file.read_text().split('\n') if l.strip() and not l.startswith('#')])
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
urls_count = 1 # Single URL seed
|
|
||||||
|
|
||||||
# Calculate crawl progress
|
# Calculate crawl progress
|
||||||
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
|
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
|
||||||
@@ -635,8 +628,8 @@ def live_progress_view(request):
|
|||||||
})
|
})
|
||||||
|
|
||||||
# Check if crawl can start (for debugging stuck crawls)
|
# Check if crawl can start (for debugging stuck crawls)
|
||||||
can_start = bool(crawl.seed and crawl.seed.uri)
|
can_start = bool(crawl.urls)
|
||||||
seed_uri = crawl.seed.uri[:60] if crawl.seed and crawl.seed.uri else None
|
urls_preview = crawl.urls[:60] if crawl.urls else None
|
||||||
|
|
||||||
# Check if retry_at is in the future (would prevent worker from claiming)
|
# Check if retry_at is in the future (would prevent worker from claiming)
|
||||||
retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
|
retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
|
||||||
@@ -657,7 +650,7 @@ def live_progress_view(request):
|
|||||||
'pending_snapshots': pending_snapshots,
|
'pending_snapshots': pending_snapshots,
|
||||||
'active_snapshots': active_snapshots_for_crawl,
|
'active_snapshots': active_snapshots_for_crawl,
|
||||||
'can_start': can_start,
|
'can_start': can_start,
|
||||||
'seed_uri': seed_uri,
|
'urls_preview': urls_preview,
|
||||||
'retry_at_future': retry_at_future,
|
'retry_at_future': retry_at_future,
|
||||||
'seconds_until_retry': seconds_until_retry,
|
'seconds_until_retry': seconds_until_retry,
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from django_object_actions import action
|
|||||||
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||||
|
|
||||||
from core.models import Snapshot
|
from core.models import Snapshot
|
||||||
from crawls.models import Seed, Crawl, CrawlSchedule
|
from crawls.models import Crawl, CrawlSchedule
|
||||||
|
|
||||||
|
|
||||||
def render_snapshots_list(snapshots_qs, limit=20):
|
def render_snapshots_list(snapshots_qs, limit=20):
|
||||||
@@ -136,16 +136,16 @@ def render_snapshots_list(snapshots_qs, limit=20):
|
|||||||
''')
|
''')
|
||||||
|
|
||||||
|
|
||||||
class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
|
class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||||
list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
|
list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
|
||||||
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'schedule_str', 'status', 'retry_at')
|
||||||
search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'schedule_id', 'status', 'urls')
|
||||||
|
|
||||||
readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
|
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'urls_editor')
|
||||||
|
|
||||||
fieldsets = (
|
fieldsets = (
|
||||||
('Source', {
|
('URLs', {
|
||||||
'fields': ('uri', 'contents'),
|
'fields': ('urls_editor',),
|
||||||
'classes': ('card', 'wide'),
|
'classes': ('card', 'wide'),
|
||||||
}),
|
}),
|
||||||
('Info', {
|
('Info', {
|
||||||
@@ -153,83 +153,7 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|||||||
'classes': ('card',),
|
'classes': ('card',),
|
||||||
}),
|
}),
|
||||||
('Settings', {
|
('Settings', {
|
||||||
'fields': ('extractor', 'config'),
|
'fields': ('max_depth', 'extractor', 'config'),
|
||||||
'classes': ('card',),
|
|
||||||
}),
|
|
||||||
('Metadata', {
|
|
||||||
'fields': ('created_by', 'created_at', 'modified_at'),
|
|
||||||
'classes': ('card',),
|
|
||||||
}),
|
|
||||||
('Crawls', {
|
|
||||||
'fields': ('scheduled_crawls', 'crawls'),
|
|
||||||
'classes': ('card',),
|
|
||||||
}),
|
|
||||||
('Snapshots', {
|
|
||||||
'fields': ('snapshots',),
|
|
||||||
'classes': ('card',),
|
|
||||||
}),
|
|
||||||
)
|
|
||||||
|
|
||||||
list_filter = ('extractor', 'created_by')
|
|
||||||
ordering = ['-created_at']
|
|
||||||
list_per_page = 100
|
|
||||||
actions = ["delete_selected"]
|
|
||||||
|
|
||||||
def num_crawls(self, obj):
|
|
||||||
return obj.crawl_set.count()
|
|
||||||
|
|
||||||
def num_snapshots(self, obj):
|
|
||||||
return obj.snapshot_set.count()
|
|
||||||
|
|
||||||
def scheduled_crawls(self, obj):
|
|
||||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
|
||||||
(scheduledcrawl.admin_change_url, scheduledcrawl)
|
|
||||||
for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
|
|
||||||
)) or mark_safe('<i>No Scheduled Crawls yet...</i>')
|
|
||||||
|
|
||||||
def crawls(self, obj):
|
|
||||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
|
||||||
(crawl.admin_change_url, crawl)
|
|
||||||
for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
|
|
||||||
)) or mark_safe('<i>No Crawls yet...</i>')
|
|
||||||
|
|
||||||
def snapshots(self, obj):
|
|
||||||
return render_snapshots_list(obj.snapshot_set.all())
|
|
||||||
|
|
||||||
def contents(self, obj):
|
|
||||||
source_file = obj.get_file_path()
|
|
||||||
if source_file:
|
|
||||||
contents = ""
|
|
||||||
try:
|
|
||||||
contents = source_file.read_text().strip()[:14_000]
|
|
||||||
except Exception as e:
|
|
||||||
contents = f'Error reading {source_file}: {e}'
|
|
||||||
|
|
||||||
return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
|
|
||||||
|
|
||||||
return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|
||||||
list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
|
|
||||||
sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
|
|
||||||
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
|
|
||||||
|
|
||||||
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
|
|
||||||
|
|
||||||
fieldsets = (
|
|
||||||
('URLs', {
|
|
||||||
'fields': ('seed_urls_editor',),
|
|
||||||
'classes': ('card', 'wide'),
|
|
||||||
}),
|
|
||||||
('Info', {
|
|
||||||
'fields': ('label', 'notes'),
|
|
||||||
'classes': ('card',),
|
|
||||||
}),
|
|
||||||
('Settings', {
|
|
||||||
'fields': ('max_depth', 'config'),
|
|
||||||
'classes': ('card',),
|
'classes': ('card',),
|
||||||
}),
|
}),
|
||||||
('Status', {
|
('Status', {
|
||||||
@@ -237,7 +161,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|||||||
'classes': ('card',),
|
'classes': ('card',),
|
||||||
}),
|
}),
|
||||||
('Relations', {
|
('Relations', {
|
||||||
'fields': ('seed', 'schedule', 'created_by'),
|
'fields': ('schedule', 'created_by'),
|
||||||
'classes': ('card',),
|
'classes': ('card',),
|
||||||
}),
|
}),
|
||||||
('Timestamps', {
|
('Timestamps', {
|
||||||
@@ -250,7 +174,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|||||||
}),
|
}),
|
||||||
)
|
)
|
||||||
|
|
||||||
list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
|
list_filter = ('max_depth', 'extractor', 'schedule', 'created_by', 'status', 'retry_at')
|
||||||
ordering = ['-created_at', '-retry_at']
|
ordering = ['-created_at', '-retry_at']
|
||||||
list_per_page = 100
|
list_per_page = 100
|
||||||
actions = ["delete_selected"]
|
actions = ["delete_selected"]
|
||||||
@@ -258,23 +182,20 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|||||||
|
|
||||||
@action(label='Recrawl', description='Create a new crawl with the same settings')
|
@action(label='Recrawl', description='Create a new crawl with the same settings')
|
||||||
def recrawl(self, request, obj):
|
def recrawl(self, request, obj):
|
||||||
"""Duplicate this crawl as a new crawl with the same seed and settings."""
|
"""Duplicate this crawl as a new crawl with the same URLs and settings."""
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
from django.shortcuts import redirect
|
from django.shortcuts import redirect
|
||||||
|
|
||||||
# Validate seed has a URI (required for crawl to start)
|
# Validate URLs (required for crawl to start)
|
||||||
if not obj.seed:
|
if not obj.urls:
|
||||||
messages.error(request, 'Cannot recrawl: original crawl has no seed.')
|
messages.error(request, 'Cannot recrawl: original crawl has no URLs.')
|
||||||
return redirect('admin:crawls_crawl_change', obj.id)
|
|
||||||
|
|
||||||
if not obj.seed.uri:
|
|
||||||
messages.error(request, 'Cannot recrawl: seed has no URI.')
|
|
||||||
return redirect('admin:crawls_crawl_change', obj.id)
|
return redirect('admin:crawls_crawl_change', obj.id)
|
||||||
|
|
||||||
new_crawl = Crawl.objects.create(
|
new_crawl = Crawl.objects.create(
|
||||||
seed=obj.seed,
|
|
||||||
urls=obj.urls,
|
urls=obj.urls,
|
||||||
|
extractor=obj.extractor,
|
||||||
max_depth=obj.max_depth,
|
max_depth=obj.max_depth,
|
||||||
|
tags_str=obj.tags_str,
|
||||||
config=obj.config,
|
config=obj.config,
|
||||||
schedule=obj.schedule,
|
schedule=obj.schedule,
|
||||||
label=f"{obj.label} (recrawl)" if obj.label else "",
|
label=f"{obj.label} (recrawl)" if obj.label else "",
|
||||||
@@ -292,43 +213,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|||||||
|
|
||||||
return redirect('admin:crawls_crawl_change', new_crawl.id)
|
return redirect('admin:crawls_crawl_change', new_crawl.id)
|
||||||
|
|
||||||
def get_urls(self):
|
|
||||||
urls = super().get_urls()
|
|
||||||
custom_urls = [
|
|
||||||
path('<path:object_id>/save_seed_contents/',
|
|
||||||
self.admin_site.admin_view(self.save_seed_contents_view),
|
|
||||||
name='crawls_crawl_save_seed_contents'),
|
|
||||||
]
|
|
||||||
return custom_urls + urls
|
|
||||||
|
|
||||||
def save_seed_contents_view(self, request, object_id):
|
|
||||||
"""Handle saving seed file contents via AJAX."""
|
|
||||||
if request.method != 'POST':
|
|
||||||
return JsonResponse({'success': False, 'error': 'POST required'}, status=405)
|
|
||||||
|
|
||||||
try:
|
|
||||||
crawl = Crawl.objects.get(pk=object_id)
|
|
||||||
except Crawl.DoesNotExist:
|
|
||||||
return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
|
|
||||||
|
|
||||||
source_file = crawl.seed.get_file_path() if crawl.seed else None
|
|
||||||
if not source_file:
|
|
||||||
return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
|
|
||||||
|
|
||||||
try:
|
|
||||||
data = json.loads(request.body)
|
|
||||||
contents = data.get('contents', '')
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Ensure parent directory exists
|
|
||||||
source_file.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
source_file.write_text(contents)
|
|
||||||
return JsonResponse({'success': True, 'message': f'Saved {len(contents)} bytes to {source_file.name}'})
|
|
||||||
except Exception as e:
|
|
||||||
return JsonResponse({'success': False, 'error': str(e)}, status=500)
|
|
||||||
|
|
||||||
def num_snapshots(self, obj):
|
def num_snapshots(self, obj):
|
||||||
return obj.snapshot_set.count()
|
return obj.snapshot_set.count()
|
||||||
|
|
||||||
@@ -341,163 +225,68 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|||||||
return mark_safe('<i>None</i>')
|
return mark_safe('<i>None</i>')
|
||||||
return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
|
return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
|
||||||
|
|
||||||
@admin.display(description='Seed', ordering='seed')
|
@admin.display(description='URLs', ordering='urls')
|
||||||
def seed_str(self, obj):
|
def urls_preview(self, obj):
|
||||||
if not obj.seed:
|
first_url = obj.get_urls_list()[0] if obj.get_urls_list() else ''
|
||||||
return mark_safe('<i>None</i>')
|
return first_url[:80] + '...' if len(first_url) > 80 else first_url
|
||||||
return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)
|
|
||||||
|
|
||||||
@admin.display(description='URLs')
|
@admin.display(description='URLs')
|
||||||
def seed_urls_editor(self, obj):
|
def urls_editor(self, obj):
|
||||||
"""Combined editor showing seed URL and file contents."""
|
"""Editor for crawl URLs."""
|
||||||
widget_id = f'seed_urls_{obj.pk}'
|
widget_id = f'crawl_urls_{obj.pk}'
|
||||||
|
|
||||||
# Get the seed URI (or use urls field if no seed)
|
|
||||||
seed_uri = ''
|
|
||||||
if obj.seed and obj.seed.uri:
|
|
||||||
seed_uri = obj.seed.uri
|
|
||||||
elif obj.urls:
|
|
||||||
seed_uri = obj.urls
|
|
||||||
|
|
||||||
# Check if it's a local file we can edit
|
# Check if it's a local file we can edit
|
||||||
source_file = obj.seed.get_file_path() if obj.seed else None
|
source_file = obj.get_file_path()
|
||||||
is_file = source_file is not None
|
is_file = source_file is not None
|
||||||
contents = ""
|
file_contents = ""
|
||||||
error = None
|
error = None
|
||||||
|
|
||||||
if is_file and source_file:
|
if is_file and source_file:
|
||||||
try:
|
try:
|
||||||
contents = source_file.read_text().strip()
|
file_contents = source_file.read_text().strip()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error = f'Error reading {source_file}: {e}'
|
error = f'Error reading {source_file}: {e}'
|
||||||
|
|
||||||
# Escape for safe HTML embedding
|
# Escape for safe HTML embedding
|
||||||
escaped_uri = seed_uri.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
|
escaped_urls = (obj.urls or '').replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
|
||||||
escaped_contents = (contents or '').replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
|
escaped_file_contents = file_contents.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
|
||||||
|
|
||||||
# Count lines for auto-expand logic
|
# Count lines for auto-expand logic
|
||||||
line_count = len(contents.split('\n')) if contents else 0
|
line_count = len((obj.urls or '').split('\n'))
|
||||||
uri_rows = min(max(1, seed_uri.count('\n') + 1), 3)
|
file_line_count = len(file_contents.split('\n')) if file_contents else 0
|
||||||
|
uri_rows = min(max(3, line_count), 10)
|
||||||
|
|
||||||
html = f'''
|
html = f'''
|
||||||
<div id="{widget_id}_container" style="max-width: 900px;">
|
<div id="{widget_id}_container" style="max-width: 900px;">
|
||||||
<!-- Seed URL input (auto-expands) -->
|
<!-- URLs input -->
|
||||||
<div style="margin-bottom: 12px;">
|
<div style="margin-bottom: 12px;">
|
||||||
<label style="font-weight: bold; display: block; margin-bottom: 4px;">Seed URL:</label>
|
<label style="font-weight: bold; display: block; margin-bottom: 4px;">URLs (one per line):</label>
|
||||||
<textarea id="{widget_id}_uri"
|
<textarea id="{widget_id}_urls"
|
||||||
style="width: 100%; font-family: monospace; font-size: 13px;
|
style="width: 100%; font-family: monospace; font-size: 13px;
|
||||||
padding: 8px; border: 1px solid #ccc; border-radius: 4px;
|
padding: 8px; border: 1px solid #ccc; border-radius: 4px;
|
||||||
resize: vertical; min-height: 32px; overflow: hidden;"
|
resize: vertical;"
|
||||||
rows="{uri_rows}"
|
rows="{uri_rows}"
|
||||||
placeholder="file:///data/sources/... or https://..."
|
placeholder="https://example.com https://example2.com # Comments start with #"
|
||||||
{"readonly" if not obj.pk else ""}>{escaped_uri}</textarea>
|
readonly>{escaped_urls}</textarea>
|
||||||
|
<p style="color: #666; font-size: 12px; margin: 4px 0 0 0;">
|
||||||
|
{line_count} URL{'s' if line_count != 1 else ''} · URLs are read-only in admin, edit via API or CLI
|
||||||
|
</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{"" if not is_file else f'''
|
{"" if not is_file else f'''
|
||||||
<!-- File contents editor -->
|
<!-- File contents preview (if first URL is a file://) -->
|
||||||
<div style="margin-bottom: 8px;">
|
<div style="margin-bottom: 8px;">
|
||||||
<label style="font-weight: bold; display: block; margin-bottom: 4px;">
|
<label style="font-weight: bold; display: block; margin-bottom: 4px;">
|
||||||
File Contents: <code style="font-weight: normal; color: #666;">{source_file}</code>
|
File Preview: <code style="font-weight: normal; color: #666;">{source_file}</code>
|
||||||
</label>
|
</label>
|
||||||
{"<div style='color: #dc3545; margin-bottom: 8px;'>" + error + "</div>" if error else ""}
|
{"<div style='color: #dc3545; margin-bottom: 8px;'>" + error + "</div>" if error else ""}
|
||||||
<textarea id="{widget_id}_contents"
|
<textarea id="{widget_id}_file_preview"
|
||||||
style="width: 100%; height: {min(400, max(150, line_count * 18))}px; font-family: monospace; font-size: 12px;
|
style="width: 100%; height: {min(400, max(150, file_line_count * 18))}px; font-family: monospace; font-size: 12px;
|
||||||
padding: 8px; border: 1px solid #ccc; border-radius: 4px; resize: vertical;"
|
padding: 8px; border: 1px solid #ccc; border-radius: 4px; resize: vertical; background: #f9f9f9;"
|
||||||
placeholder="Enter URLs, one per line...">{escaped_contents}</textarea>
|
readonly>{escaped_file_contents}</textarea>
|
||||||
</div>
|
|
||||||
|
|
||||||
<div style="display: flex; gap: 12px; align-items: center; flex-wrap: wrap;">
|
|
||||||
<button type="button" id="{widget_id}_save_btn"
|
|
||||||
onclick="saveSeedUrls_{widget_id}()"
|
|
||||||
style="padding: 8px 20px; background: #417690; color: white; border: none;
|
|
||||||
border-radius: 4px; cursor: pointer; font-weight: bold;">
|
|
||||||
Save URLs
|
|
||||||
</button>
|
|
||||||
<span id="{widget_id}_line_count" style="color: #666; font-size: 12px;"></span>
|
|
||||||
<span id="{widget_id}_status" style="color: #666; font-size: 12px;"></span>
|
|
||||||
</div>
|
</div>
|
||||||
'''}
|
'''}
|
||||||
|
|
||||||
{"" if is_file else f'''
|
|
||||||
<div style="margin-top: 8px; color: #666;">
|
|
||||||
<a href="{seed_uri}" target="_blank">{seed_uri}</a>
|
|
||||||
</div>
|
|
||||||
'''}
|
|
||||||
|
|
||||||
<script>
|
|
||||||
(function() {{
|
|
||||||
var uriInput = document.getElementById('{widget_id}_uri');
|
|
||||||
var contentsInput = document.getElementById('{widget_id}_contents');
|
|
||||||
var status = document.getElementById('{widget_id}_status');
|
|
||||||
var lineCount = document.getElementById('{widget_id}_line_count');
|
|
||||||
var saveBtn = document.getElementById('{widget_id}_save_btn');
|
|
||||||
|
|
||||||
// Auto-resize URI input
|
|
||||||
function autoResizeUri() {{
|
|
||||||
uriInput.style.height = 'auto';
|
|
||||||
uriInput.style.height = Math.min(100, uriInput.scrollHeight) + 'px';
|
|
||||||
}}
|
|
||||||
uriInput.addEventListener('input', autoResizeUri);
|
|
||||||
autoResizeUri();
|
|
||||||
|
|
||||||
if (contentsInput) {{
|
|
||||||
function updateLineCount() {{
|
|
||||||
var lines = contentsInput.value.split('\\n').filter(function(l) {{ return l.trim(); }});
|
|
||||||
lineCount.textContent = lines.length + ' URLs';
|
|
||||||
}}
|
|
||||||
|
|
||||||
contentsInput.addEventListener('input', function() {{
|
|
||||||
updateLineCount();
|
|
||||||
if (status) {{
|
|
||||||
status.textContent = '(unsaved changes)';
|
|
||||||
status.style.color = '#c4820e';
|
|
||||||
}}
|
|
||||||
}});
|
|
||||||
|
|
||||||
updateLineCount();
|
|
||||||
}}
|
|
||||||
|
|
||||||
window.saveSeedUrls_{widget_id} = function() {{
|
|
||||||
if (!saveBtn) return;
|
|
||||||
saveBtn.disabled = true;
|
|
||||||
saveBtn.textContent = 'Saving...';
|
|
||||||
if (status) status.textContent = '';
|
|
||||||
|
|
||||||
fetch(window.location.pathname + 'save_seed_contents/', {{
|
|
||||||
method: 'POST',
|
|
||||||
headers: {{
|
|
||||||
'Content-Type': 'application/json',
|
|
||||||
'X-CSRFToken': document.querySelector('[name=csrfmiddlewaretoken]').value
|
|
||||||
}},
|
|
||||||
body: JSON.stringify({{ contents: contentsInput ? contentsInput.value : '' }})
|
|
||||||
}})
|
|
||||||
.then(function(response) {{ return response.json(); }})
|
|
||||||
.then(function(data) {{
|
|
||||||
if (data.success) {{
|
|
||||||
if (status) {{
|
|
||||||
status.textContent = '✓ ' + data.message;
|
|
||||||
status.style.color = '#28a745';
|
|
||||||
}}
|
|
||||||
}} else {{
|
|
||||||
if (status) {{
|
|
||||||
status.textContent = '✗ ' + data.error;
|
|
||||||
status.style.color = '#dc3545';
|
|
||||||
}}
|
|
||||||
}}
|
|
||||||
}})
|
|
||||||
.catch(function(err) {{
|
|
||||||
if (status) {{
|
|
||||||
status.textContent = '✗ Error: ' + err;
|
|
||||||
status.style.color = '#dc3545';
|
|
||||||
}}
|
|
||||||
}})
|
|
||||||
.finally(function() {{
|
|
||||||
saveBtn.disabled = false;
|
|
||||||
saveBtn.textContent = 'Save URLs';
|
|
||||||
}});
|
|
||||||
}};
|
|
||||||
}})();
|
|
||||||
</script>
|
|
||||||
</div>
|
</div>
|
||||||
'''
|
'''
|
||||||
return mark_safe(html)
|
return mark_safe(html)
|
||||||
@@ -507,7 +296,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|||||||
class CrawlScheduleAdmin(BaseModelAdmin):
|
class CrawlScheduleAdmin(BaseModelAdmin):
|
||||||
list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str', 'crawls', 'num_crawls', 'num_snapshots')
|
list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str', 'crawls', 'num_crawls', 'num_snapshots')
|
||||||
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str')
|
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str')
|
||||||
search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__seed__uri')
|
search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__urls')
|
||||||
|
|
||||||
readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots')
|
readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots')
|
||||||
|
|
||||||
@@ -561,6 +350,5 @@ class CrawlScheduleAdmin(BaseModelAdmin):
|
|||||||
|
|
||||||
|
|
||||||
def register_admin(admin_site):
|
def register_admin(admin_site):
|
||||||
admin_site.register(Seed, SeedAdmin)
|
|
||||||
admin_site.register(Crawl, CrawlAdmin)
|
admin_site.register(Crawl, CrawlAdmin)
|
||||||
admin_site.register(CrawlSchedule, CrawlScheduleAdmin)
|
admin_site.register(CrawlSchedule, CrawlScheduleAdmin)
|
||||||
|
|||||||
61
archivebox/crawls/migrations/0002_drop_seed_model.py
Normal file
61
archivebox/crawls/migrations/0002_drop_seed_model.py
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
# Generated by Django 6.0 on 2025-12-25 09:34
|
||||||
|
|
||||||
|
import archivebox.base_models.models
|
||||||
|
import django.db.models.deletion
|
||||||
|
import pathlib
|
||||||
|
import uuid
|
||||||
|
from django.conf import settings
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
('crawls', '0001_initial'),
|
||||||
|
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.RemoveField(
|
||||||
|
model_name='crawl',
|
||||||
|
name='seed',
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='crawl',
|
||||||
|
name='extractor',
|
||||||
|
field=models.CharField(default='auto', help_text='Parser for reading URLs (auto, html, json, rss, etc)', max_length=32),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='crawl',
|
||||||
|
name='created_by',
|
||||||
|
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='crawl',
|
||||||
|
name='id',
|
||||||
|
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='crawl',
|
||||||
|
name='output_dir',
|
||||||
|
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/data/archive')),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='crawl',
|
||||||
|
name='urls',
|
||||||
|
field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='crawlschedule',
|
||||||
|
name='created_by',
|
||||||
|
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='crawlschedule',
|
||||||
|
name='id',
|
||||||
|
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||||
|
),
|
||||||
|
migrations.DeleteModel(
|
||||||
|
name='Seed',
|
||||||
|
),
|
||||||
|
]
|
||||||
@@ -20,91 +20,6 @@ if TYPE_CHECKING:
|
|||||||
from core.models import Snapshot, ArchiveResult
|
from core.models import Snapshot, ArchiveResult
|
||||||
|
|
||||||
|
|
||||||
class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats):
|
|
||||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
|
||||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
|
||||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
|
|
||||||
modified_at = models.DateTimeField(auto_now=True)
|
|
||||||
|
|
||||||
uri = models.URLField(max_length=2048)
|
|
||||||
extractor = models.CharField(default='auto', max_length=32)
|
|
||||||
tags_str = models.CharField(max_length=255, null=False, blank=True, default='')
|
|
||||||
label = models.CharField(max_length=255, null=False, blank=True, default='')
|
|
||||||
config = models.JSONField(default=dict)
|
|
||||||
output_dir = models.FilePathField(path=settings.ARCHIVE_DIR, null=False, blank=True, default='')
|
|
||||||
notes = models.TextField(blank=True, null=False, default='')
|
|
||||||
|
|
||||||
crawl_set: models.Manager['Crawl']
|
|
||||||
|
|
||||||
class Meta:
|
|
||||||
verbose_name = 'Seed'
|
|
||||||
verbose_name_plural = 'Seeds'
|
|
||||||
unique_together = (('created_by', 'uri', 'extractor'), ('created_by', 'label'))
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return f'[{self.id}] {self.uri[:64]}'
|
|
||||||
|
|
||||||
def save(self, *args, **kwargs):
|
|
||||||
is_new = self._state.adding
|
|
||||||
super().save(*args, **kwargs)
|
|
||||||
if is_new:
|
|
||||||
from archivebox.misc.logging_util import log_worker_event
|
|
||||||
log_worker_event(
|
|
||||||
worker_type='DB',
|
|
||||||
event='Created Seed',
|
|
||||||
indent_level=0,
|
|
||||||
metadata={
|
|
||||||
'id': str(self.id),
|
|
||||||
'uri': str(self.uri)[:64],
|
|
||||||
'extractor': self.extractor,
|
|
||||||
'label': self.label or None,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_file(cls, source_file: Path, label: str = '', parser: str = 'auto', tag: str = '', created_by=None, config=None):
|
|
||||||
# Use absolute path for file:// URLs so extractors can find the files
|
|
||||||
source_path = str(source_file.resolve())
|
|
||||||
seed, _ = cls.objects.get_or_create(
|
|
||||||
label=label or source_file.name, uri=f'file://{source_path}',
|
|
||||||
created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
|
|
||||||
extractor=parser, tags_str=tag, config=config or {},
|
|
||||||
)
|
|
||||||
return seed
|
|
||||||
|
|
||||||
@property
|
|
||||||
def source_type(self):
|
|
||||||
return self.uri.split('://', 1)[0].lower()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def api_url(self) -> str:
|
|
||||||
return reverse_lazy('api-1:get_seed', args=[self.id])
|
|
||||||
|
|
||||||
def get_file_path(self) -> Path | None:
|
|
||||||
"""
|
|
||||||
Get the filesystem path for file:// URIs.
|
|
||||||
Handles both old format (file:///data/...) and new format (file:///absolute/path).
|
|
||||||
Returns None if URI is not a file:// URI.
|
|
||||||
"""
|
|
||||||
if not self.uri.startswith('file://'):
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Remove file:// prefix
|
|
||||||
path_str = self.uri.replace('file://', '', 1)
|
|
||||||
|
|
||||||
# Handle old format: file:///data/... -> DATA_DIR/...
|
|
||||||
if path_str.startswith('/data/'):
|
|
||||||
return CONSTANTS.DATA_DIR / path_str.replace('/data/', '', 1)
|
|
||||||
|
|
||||||
# Handle new format: file:///absolute/path
|
|
||||||
return Path(path_str)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def snapshot_set(self) -> QuerySet['Snapshot']:
|
|
||||||
from core.models import Snapshot
|
|
||||||
return Snapshot.objects.filter(crawl_id__in=self.crawl_set.values_list('pk', flat=True))
|
|
||||||
|
|
||||||
|
|
||||||
class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
|
class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
|
||||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||||
@@ -124,14 +39,15 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
|
|||||||
verbose_name_plural = 'Scheduled Crawls'
|
verbose_name_plural = 'Scheduled Crawls'
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
return f'[{self.id}] {self.template.seed.uri[:64] if self.template and self.template.seed else ""} @ {self.schedule}'
|
urls_preview = self.template.urls[:64] if self.template and self.template.urls else ""
|
||||||
|
return f'[{self.id}] {urls_preview} @ {self.schedule}'
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def api_url(self) -> str:
|
def api_url(self) -> str:
|
||||||
return reverse_lazy('api-1:get_any', args=[self.id])
|
return reverse_lazy('api-1:get_any', args=[self.id])
|
||||||
|
|
||||||
def save(self, *args, **kwargs):
|
def save(self, *args, **kwargs):
|
||||||
self.label = self.label or (self.template.seed.label if self.template and self.template.seed else '')
|
self.label = self.label or (self.template.label if self.template else '')
|
||||||
super().save(*args, **kwargs)
|
super().save(*args, **kwargs)
|
||||||
if self.template:
|
if self.template:
|
||||||
self.template.schedule = self
|
self.template.schedule = self
|
||||||
@@ -144,8 +60,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
|
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
|
||||||
modified_at = models.DateTimeField(auto_now=True)
|
modified_at = models.DateTimeField(auto_now=True)
|
||||||
|
|
||||||
seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
|
urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl')
|
||||||
urls = models.TextField(blank=True, null=False, default='')
|
extractor = models.CharField(default='auto', max_length=32, help_text='Parser for reading URLs (auto, html, json, rss, etc)')
|
||||||
config = models.JSONField(default=dict)
|
config = models.JSONField(default=dict)
|
||||||
max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
|
max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
|
||||||
tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
|
tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
|
||||||
@@ -171,31 +87,40 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
verbose_name_plural = 'Crawls'
|
verbose_name_plural = 'Crawls'
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return f'[{self.id}] {self.seed.uri[:64] if self.seed else ""}'
|
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
|
||||||
|
return f'[{self.id}] {first_url[:64]}'
|
||||||
|
|
||||||
def save(self, *args, **kwargs):
|
def save(self, *args, **kwargs):
|
||||||
is_new = self._state.adding
|
is_new = self._state.adding
|
||||||
super().save(*args, **kwargs)
|
super().save(*args, **kwargs)
|
||||||
if is_new:
|
if is_new:
|
||||||
from archivebox.misc.logging_util import log_worker_event
|
from archivebox.misc.logging_util import log_worker_event
|
||||||
|
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
|
||||||
log_worker_event(
|
log_worker_event(
|
||||||
worker_type='DB',
|
worker_type='DB',
|
||||||
event='Created Crawl',
|
event='Created Crawl',
|
||||||
indent_level=1,
|
indent_level=1,
|
||||||
metadata={
|
metadata={
|
||||||
'id': str(self.id),
|
'id': str(self.id),
|
||||||
'seed_uri': str(self.seed.uri)[:64] if self.seed else None,
|
'first_url': first_url[:64],
|
||||||
'max_depth': self.max_depth,
|
'max_depth': self.max_depth,
|
||||||
'status': self.status,
|
'status': self.status,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_seed(cls, seed: Seed, max_depth: int = 0, persona: str = 'Default', tags_str: str = '', config=None, created_by=None):
|
def from_file(cls, source_file: Path, max_depth: int = 0, label: str = '', extractor: str = 'auto',
|
||||||
crawl, _ = cls.objects.get_or_create(
|
tags_str: str = '', config=None, created_by=None):
|
||||||
seed=seed, max_depth=max_depth, tags_str=tags_str or seed.tags_str,
|
"""Create a crawl from a file containing URLs."""
|
||||||
config=seed.config or config or {},
|
urls_content = source_file.read_text()
|
||||||
created_by_id=getattr(created_by, 'pk', created_by) or seed.created_by_id,
|
crawl = cls.objects.create(
|
||||||
|
urls=urls_content,
|
||||||
|
extractor=extractor,
|
||||||
|
max_depth=max_depth,
|
||||||
|
tags_str=tags_str,
|
||||||
|
label=label or source_file.name,
|
||||||
|
config=config or {},
|
||||||
|
created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
|
||||||
)
|
)
|
||||||
return crawl
|
return crawl
|
||||||
|
|
||||||
@@ -203,14 +128,47 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
def api_url(self) -> str:
|
def api_url(self) -> str:
|
||||||
return reverse_lazy('api-1:get_crawl', args=[self.id])
|
return reverse_lazy('api-1:get_crawl', args=[self.id])
|
||||||
|
|
||||||
|
def get_urls_list(self) -> list[str]:
|
||||||
|
"""Get list of URLs from urls field, filtering out comments and empty lines."""
|
||||||
|
if not self.urls:
|
||||||
|
return []
|
||||||
|
return [
|
||||||
|
url.strip()
|
||||||
|
for url in self.urls.split('\n')
|
||||||
|
if url.strip() and not url.strip().startswith('#')
|
||||||
|
]
|
||||||
|
|
||||||
|
def get_file_path(self) -> Path | None:
|
||||||
|
"""
|
||||||
|
Get filesystem path if this crawl references a local file.
|
||||||
|
Checks if the first URL is a file:// URI.
|
||||||
|
"""
|
||||||
|
urls = self.get_urls_list()
|
||||||
|
if not urls:
|
||||||
|
return None
|
||||||
|
|
||||||
|
first_url = urls[0]
|
||||||
|
if not first_url.startswith('file://'):
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Remove file:// prefix
|
||||||
|
path_str = first_url.replace('file://', '', 1)
|
||||||
|
return Path(path_str)
|
||||||
|
|
||||||
def create_root_snapshot(self) -> 'Snapshot':
|
def create_root_snapshot(self) -> 'Snapshot':
|
||||||
from core.models import Snapshot
|
from core.models import Snapshot
|
||||||
|
|
||||||
|
first_url = self.get_urls_list()[0] if self.get_urls_list() else None
|
||||||
|
if not first_url:
|
||||||
|
raise ValueError(f'Crawl {self.id} has no URLs to create root snapshot from')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return Snapshot.objects.get(crawl=self, url=self.seed.uri)
|
return Snapshot.objects.get(crawl=self, url=first_url)
|
||||||
except Snapshot.DoesNotExist:
|
except Snapshot.DoesNotExist:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
root_snapshot, _ = Snapshot.objects.update_or_create(
|
root_snapshot, _ = Snapshot.objects.update_or_create(
|
||||||
crawl=self, url=self.seed.uri,
|
crawl=self, url=first_url,
|
||||||
defaults={
|
defaults={
|
||||||
'status': Snapshot.INITIAL_STATE,
|
'status': Snapshot.INITIAL_STATE,
|
||||||
'retry_at': timezone.now(),
|
'retry_at': timezone.now(),
|
||||||
|
|||||||
@@ -42,11 +42,12 @@ class CrawlMachine(StateMachine, strict_states=True):
|
|||||||
return self.__repr__()
|
return self.__repr__()
|
||||||
|
|
||||||
def can_start(self) -> bool:
|
def can_start(self) -> bool:
|
||||||
if not self.crawl.seed:
|
if not self.crawl.urls:
|
||||||
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no seed[/red]')
|
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
|
||||||
return False
|
return False
|
||||||
if not self.crawl.seed.uri:
|
urls_list = self.crawl.get_urls_list()
|
||||||
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: seed has no URI[/red]')
|
if not urls_list:
|
||||||
|
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@@ -121,13 +122,14 @@ class CrawlMachine(StateMachine, strict_states=True):
|
|||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
# Run all on_Crawl hooks
|
# Run all on_Crawl hooks
|
||||||
|
first_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ''
|
||||||
results = run_hooks(
|
results = run_hooks(
|
||||||
event_name='Crawl',
|
event_name='Crawl',
|
||||||
output_dir=output_dir,
|
output_dir=output_dir,
|
||||||
timeout=60,
|
timeout=60,
|
||||||
config_objects=[self.crawl, self.crawl.seed] if self.crawl.seed else [self.crawl],
|
config_objects=[self.crawl],
|
||||||
crawl_id=str(self.crawl.id),
|
crawl_id=str(self.crawl.id),
|
||||||
seed_uri=self.crawl.seed.uri if self.crawl.seed else '',
|
seed_uri=first_url,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Process hook results - parse JSONL output and create DB objects
|
# Process hook results - parse JSONL output and create DB objects
|
||||||
|
|||||||
@@ -1,2 +0,0 @@
|
|||||||
|
|
||||||
> /Users/squash/Local/Code/archiveboxes/archivebox-nue/archivebox/cli/archivebox_init.py --force; TS=2025-12-25__08:03:12 VERSION=0.9.0rc1 IN_DOCKER=False IS_TTY=False
|
|
||||||
@@ -0,0 +1,65 @@
|
|||||||
|
# Generated by Django 6.0 on 2025-12-25 09:34
|
||||||
|
|
||||||
|
import django.db.models.deletion
|
||||||
|
import uuid
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
('machine', '0001_squashed'),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='dependency',
|
||||||
|
name='bin_name',
|
||||||
|
field=models.CharField(db_index=True, help_text='Binary executable name (e.g., wget, yt-dlp, chromium)', max_length=63, unique=True),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='dependency',
|
||||||
|
name='bin_providers',
|
||||||
|
field=models.CharField(default='*', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any', max_length=127),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='dependency',
|
||||||
|
name='config',
|
||||||
|
field=models.JSONField(blank=True, default=dict, help_text='JSON map of env var config to use during install'),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='dependency',
|
||||||
|
name='custom_cmds',
|
||||||
|
field=models.JSONField(blank=True, default=dict, help_text="JSON map of provider -> custom install command (e.g., {'apt': 'apt install -y wget'})"),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='dependency',
|
||||||
|
name='id',
|
||||||
|
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='installedbinary',
|
||||||
|
name='dependency',
|
||||||
|
field=models.ForeignKey(blank=True, help_text='The Dependency this binary satisfies', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='installedbinary_set', to='machine.dependency'),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='installedbinary',
|
||||||
|
name='id',
|
||||||
|
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='machine',
|
||||||
|
name='config',
|
||||||
|
field=models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)'),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='machine',
|
||||||
|
name='id',
|
||||||
|
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='networkinterface',
|
||||||
|
name='id',
|
||||||
|
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||||
|
),
|
||||||
|
]
|
||||||
@@ -27,10 +27,9 @@ TYPE_SNAPSHOT = 'Snapshot'
|
|||||||
TYPE_ARCHIVERESULT = 'ArchiveResult'
|
TYPE_ARCHIVERESULT = 'ArchiveResult'
|
||||||
TYPE_TAG = 'Tag'
|
TYPE_TAG = 'Tag'
|
||||||
TYPE_CRAWL = 'Crawl'
|
TYPE_CRAWL = 'Crawl'
|
||||||
TYPE_SEED = 'Seed'
|
|
||||||
TYPE_INSTALLEDBINARY = 'InstalledBinary'
|
TYPE_INSTALLEDBINARY = 'InstalledBinary'
|
||||||
|
|
||||||
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_SEED, TYPE_INSTALLEDBINARY}
|
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_INSTALLEDBINARY}
|
||||||
|
|
||||||
|
|
||||||
def parse_line(line: str) -> Optional[Dict[str, Any]]:
|
def parse_line(line: str) -> Optional[Dict[str, Any]]:
|
||||||
@@ -206,7 +205,8 @@ def crawl_to_jsonl(crawl) -> Dict[str, Any]:
|
|||||||
return {
|
return {
|
||||||
'type': TYPE_CRAWL,
|
'type': TYPE_CRAWL,
|
||||||
'id': str(crawl.id),
|
'id': str(crawl.id),
|
||||||
'seed_id': str(crawl.seed_id),
|
'urls': crawl.urls,
|
||||||
|
'extractor': crawl.extractor,
|
||||||
'status': crawl.status,
|
'status': crawl.status,
|
||||||
'max_depth': crawl.max_depth,
|
'max_depth': crawl.max_depth,
|
||||||
'created_at': crawl.created_at.isoformat() if crawl.created_at else None,
|
'created_at': crawl.created_at.isoformat() if crawl.created_at else None,
|
||||||
|
|||||||
@@ -13,9 +13,11 @@ from rich.console import Console
|
|||||||
from rich.highlighter import Highlighter
|
from rich.highlighter import Highlighter
|
||||||
|
|
||||||
# SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS
|
# SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS
|
||||||
CONSOLE = Console()
|
# Disable wrapping - use soft_wrap=True and large width so text flows naturally
|
||||||
STDERR = Console(stderr=True)
|
# Colors are preserved, just no hard line breaks inserted
|
||||||
IS_TTY = CONSOLE.is_interactive
|
CONSOLE = Console(width=32768, soft_wrap=True, force_terminal=True)
|
||||||
|
STDERR = Console(stderr=True, width=32768, soft_wrap=True, force_terminal=True)
|
||||||
|
IS_TTY = sys.stdout.isatty()
|
||||||
|
|
||||||
class RainbowHighlighter(Highlighter):
|
class RainbowHighlighter(Highlighter):
|
||||||
def highlight(self, text):
|
def highlight(self, text):
|
||||||
|
|||||||
@@ -603,21 +603,17 @@ def log_worker_event(
|
|||||||
|
|
||||||
# Build final message
|
# Build final message
|
||||||
error_str = f' {type(error).__name__}: {error}' if error else ''
|
error_str = f' {type(error).__name__}: {error}' if error else ''
|
||||||
# Build colored message - worker_label needs to be inside color tags
|
|
||||||
# But first we need to format the color tags separately from the worker label
|
|
||||||
from archivebox.misc.logging import CONSOLE
|
from archivebox.misc.logging import CONSOLE
|
||||||
from rich.text import Text
|
from rich.text import Text
|
||||||
|
|
||||||
# Create a Rich Text object for proper formatting
|
# Create a Rich Text object for proper formatting
|
||||||
text = Text()
|
text = Text()
|
||||||
text.append(indent) # Indentation
|
text.append(indent)
|
||||||
# Append worker label and event with color
|
|
||||||
text.append(f'{worker_label} {event}{error_str}', style=color)
|
text.append(f'{worker_label} {event}{error_str}', style=color)
|
||||||
# Append metadata without color (add separator if metadata exists)
|
|
||||||
if metadata_str:
|
if metadata_str:
|
||||||
text.append(f' | {metadata_str}')
|
text.append(f' | {metadata_str}')
|
||||||
|
|
||||||
CONSOLE.print(text)
|
CONSOLE.print(text, soft_wrap=True)
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
|
|||||||
@@ -1,7 +1,5 @@
|
|||||||
__package__ = 'archivebox'
|
__package__ = 'archivebox'
|
||||||
|
|
||||||
import sys
|
|
||||||
import shutil
|
|
||||||
import django
|
import django
|
||||||
import pydantic
|
import pydantic
|
||||||
|
|
||||||
@@ -20,14 +18,10 @@ timezone.utc = datetime.timezone.utc
|
|||||||
# DjangoSignalWebhooksConfig.verbose_name = 'API'
|
# DjangoSignalWebhooksConfig.verbose_name = 'API'
|
||||||
|
|
||||||
|
|
||||||
# Install rich for pretty tracebacks in console logs
|
# Rich traceback handler disabled - it adds frames/boxes that wrap weirdly in log files
|
||||||
# https://rich.readthedocs.io/en/stable/traceback.html#traceback-handler
|
# Standard Python tracebacks are used instead (full width, no frames)
|
||||||
|
# from rich.traceback import install
|
||||||
from rich.traceback import install # noqa
|
# install(show_locals=True, word_wrap=False, ...)
|
||||||
|
|
||||||
TERM_WIDTH = (shutil.get_terminal_size((200, 10)).columns - 1) if sys.stdout.isatty() else 200
|
|
||||||
# os.environ.setdefault('COLUMNS', str(TERM_WIDTH))
|
|
||||||
install(show_locals=True, word_wrap=False, locals_max_length=10, locals_hide_dunder=True, suppress=[django, pydantic], extra_lines=2, width=TERM_WIDTH)
|
|
||||||
|
|
||||||
|
|
||||||
# Hide site-packages/sonic/client.py:115: SyntaxWarning
|
# Hide site-packages/sonic/client.py:115: SyntaxWarning
|
||||||
|
|||||||
@@ -552,21 +552,21 @@
|
|||||||
if (crawl.status === 'queued' && !crawl.can_start) {
|
if (crawl.status === 'queued' && !crawl.can_start) {
|
||||||
warningHtml = `
|
warningHtml = `
|
||||||
<div style="padding: 8px 14px; background: rgba(248, 81, 73, 0.1); border-top: 1px solid #f85149; color: #f85149; font-size: 11px;">
|
<div style="padding: 8px 14px; background: rgba(248, 81, 73, 0.1); border-top: 1px solid #f85149; color: #f85149; font-size: 11px;">
|
||||||
⚠️ Crawl cannot start: ${crawl.seed_uri ? 'unknown error' : 'no seed URI'}
|
⚠️ Crawl cannot start: ${crawl.urls_preview ? 'unknown error' : 'no URLs'}
|
||||||
</div>
|
</div>
|
||||||
`;
|
`;
|
||||||
} else if (crawl.status === 'queued' && crawl.retry_at_future) {
|
} else if (crawl.status === 'queued' && crawl.retry_at_future) {
|
||||||
// Queued but retry_at is in future (was claimed by worker, will retry)
|
// Queued but retry_at is in future (was claimed by worker, will retry)
|
||||||
warningHtml = `
|
warningHtml = `
|
||||||
<div style="padding: 8px 14px; background: rgba(88, 166, 255, 0.1); border-top: 1px solid #58a6ff; color: #58a6ff; font-size: 11px;">
|
<div style="padding: 8px 14px; background: rgba(88, 166, 255, 0.1); border-top: 1px solid #58a6ff; color: #58a6ff; font-size: 11px;">
|
||||||
🔄 Retrying in ${crawl.seconds_until_retry}s...${crawl.seed_uri ? ` (${crawl.seed_uri})` : ''}
|
🔄 Retrying in ${crawl.seconds_until_retry}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
|
||||||
</div>
|
</div>
|
||||||
`;
|
`;
|
||||||
} else if (crawl.status === 'queued' && crawl.total_snapshots === 0) {
|
} else if (crawl.status === 'queued' && crawl.total_snapshots === 0) {
|
||||||
// Queued and waiting to be picked up by worker
|
// Queued and waiting to be picked up by worker
|
||||||
warningHtml = `
|
warningHtml = `
|
||||||
<div style="padding: 8px 14px; background: rgba(210, 153, 34, 0.1); border-top: 1px solid #d29922; color: #d29922; font-size: 11px;">
|
<div style="padding: 8px 14px; background: rgba(210, 153, 34, 0.1); border-top: 1px solid #d29922; color: #d29922; font-size: 11px;">
|
||||||
⏳ Waiting for worker to pick up...${crawl.seed_uri ? ` (${crawl.seed_uri})` : ''}
|
⏳ Waiting for worker to pick up...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
|
||||||
</div>
|
</div>
|
||||||
`;
|
`;
|
||||||
}
|
}
|
||||||
@@ -577,8 +577,8 @@
|
|||||||
metaText += ` | ${crawl.total_snapshots} snapshots`;
|
metaText += ` | ${crawl.total_snapshots} snapshots`;
|
||||||
} else if (crawl.urls_count > 0) {
|
} else if (crawl.urls_count > 0) {
|
||||||
metaText += ` | ${crawl.urls_count} URLs`;
|
metaText += ` | ${crawl.urls_count} URLs`;
|
||||||
} else if (crawl.seed_uri) {
|
} else if (crawl.urls_preview) {
|
||||||
metaText += ` | ${crawl.seed_uri.substring(0, 40)}${crawl.seed_uri.length > 40 ? '...' : ''}`;
|
metaText += ` | ${crawl.urls_preview.substring(0, 40)}${crawl.urls_preview.length > 40 ? '...' : ''}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
return `
|
return `
|
||||||
|
|||||||
@@ -26,6 +26,9 @@ CONFIG_FILE_NAME = "supervisord.conf"
|
|||||||
PID_FILE_NAME = "supervisord.pid"
|
PID_FILE_NAME = "supervisord.pid"
|
||||||
WORKERS_DIR_NAME = "workers"
|
WORKERS_DIR_NAME = "workers"
|
||||||
|
|
||||||
|
# Global reference to supervisord process for cleanup
|
||||||
|
_supervisord_proc = None
|
||||||
|
|
||||||
ORCHESTRATOR_WORKER = {
|
ORCHESTRATOR_WORKER = {
|
||||||
"name": "worker_orchestrator",
|
"name": "worker_orchestrator",
|
||||||
"command": "archivebox manage orchestrator", # runs forever by default
|
"command": "archivebox manage orchestrator", # runs forever by default
|
||||||
@@ -78,7 +81,7 @@ def create_supervisord_config():
|
|||||||
config_content = f"""
|
config_content = f"""
|
||||||
[supervisord]
|
[supervisord]
|
||||||
nodaemon = true
|
nodaemon = true
|
||||||
environment = IS_SUPERVISORD_PARENT="true"
|
environment = IS_SUPERVISORD_PARENT="true",COLUMNS="200"
|
||||||
pidfile = {PID_FILE}
|
pidfile = {PID_FILE}
|
||||||
logfile = {LOG_FILE}
|
logfile = {LOG_FILE}
|
||||||
childlogdir = {CONSTANTS.LOGS_DIR}
|
childlogdir = {CONSTANTS.LOGS_DIR}
|
||||||
@@ -143,11 +146,27 @@ def get_existing_supervisord_process():
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def stop_existing_supervisord_process():
|
def stop_existing_supervisord_process():
|
||||||
|
global _supervisord_proc
|
||||||
SOCK_FILE = get_sock_file()
|
SOCK_FILE = get_sock_file()
|
||||||
PID_FILE = SOCK_FILE.parent / PID_FILE_NAME
|
PID_FILE = SOCK_FILE.parent / PID_FILE_NAME
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# if pid file exists, load PID int
|
# First try to stop via the global proc reference
|
||||||
|
if _supervisord_proc and _supervisord_proc.poll() is None:
|
||||||
|
try:
|
||||||
|
print(f"[🦸♂️] Stopping supervisord process (pid={_supervisord_proc.pid})...")
|
||||||
|
_supervisord_proc.terminate()
|
||||||
|
try:
|
||||||
|
_supervisord_proc.wait(timeout=5)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
_supervisord_proc.kill()
|
||||||
|
_supervisord_proc.wait(timeout=2)
|
||||||
|
except (BaseException, BrokenPipeError, IOError, KeyboardInterrupt):
|
||||||
|
pass
|
||||||
|
_supervisord_proc = None
|
||||||
|
return
|
||||||
|
|
||||||
|
# Fallback: if pid file exists, load PID int and kill that process
|
||||||
try:
|
try:
|
||||||
pid = int(PID_FILE.read_text())
|
pid = int(PID_FILE.read_text())
|
||||||
except (FileNotFoundError, ValueError):
|
except (FileNotFoundError, ValueError):
|
||||||
@@ -156,8 +175,25 @@ def stop_existing_supervisord_process():
|
|||||||
try:
|
try:
|
||||||
print(f"[🦸♂️] Stopping supervisord process (pid={pid})...")
|
print(f"[🦸♂️] Stopping supervisord process (pid={pid})...")
|
||||||
proc = psutil.Process(pid)
|
proc = psutil.Process(pid)
|
||||||
|
# Collect all descendant processes up front so they can be stopped along with supervisord
|
||||||
|
children = proc.children(recursive=True)
|
||||||
proc.terminate()
|
proc.terminate()
|
||||||
|
# Also terminate all children
|
||||||
|
for child in children:
|
||||||
|
try:
|
||||||
|
child.terminate()
|
||||||
|
except psutil.NoSuchProcess:
|
||||||
|
pass
|
||||||
proc.wait(timeout=5)
|
proc.wait(timeout=5)
|
||||||
|
# Kill any remaining children
|
||||||
|
for child in children:
|
||||||
|
try:
|
||||||
|
if child.is_running():
|
||||||
|
child.kill()
|
||||||
|
except psutil.NoSuchProcess:
|
||||||
|
pass
|
||||||
|
except psutil.NoSuchProcess:
|
||||||
|
pass
|
||||||
except (BaseException, BrokenPipeError, IOError, KeyboardInterrupt):
|
except (BaseException, BrokenPipeError, IOError, KeyboardInterrupt):
|
||||||
pass
|
pass
|
||||||
finally:
|
finally:
|
||||||
@@ -192,40 +228,44 @@ def start_new_supervisord_process(daemonize=False):
|
|||||||
# create the supervisord config file
|
# create the supervisord config file
|
||||||
create_supervisord_config()
|
create_supervisord_config()
|
||||||
|
|
||||||
# Start supervisord
|
# Open log file for supervisord output
|
||||||
# panel = Panel(f"Starting supervisord with config: {SUPERVISORD_CONFIG_FILE}")
|
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
|
||||||
# with Live(panel, refresh_per_second=1) as live:
|
log_handle = open(LOG_FILE, 'a')
|
||||||
|
|
||||||
subprocess.Popen(
|
if daemonize:
|
||||||
f"supervisord --configuration={CONFIG_FILE}",
|
# Start supervisord in background (daemon mode)
|
||||||
stdin=None,
|
subprocess.Popen(
|
||||||
shell=True,
|
f"supervisord --configuration={CONFIG_FILE}",
|
||||||
start_new_session=daemonize,
|
stdin=None,
|
||||||
)
|
stdout=log_handle,
|
||||||
|
stderr=log_handle,
|
||||||
|
shell=True,
|
||||||
|
start_new_session=True,
|
||||||
|
)
|
||||||
|
time.sleep(2)
|
||||||
|
return get_existing_supervisord_process()
|
||||||
|
else:
|
||||||
|
# Start supervisord in FOREGROUND - this will block until supervisord exits
|
||||||
|
# supervisord with nodaemon=true will run in foreground and handle signals properly
|
||||||
|
# When supervisord gets SIGINT/SIGTERM, it will stop all child processes before exiting
|
||||||
|
proc = subprocess.Popen(
|
||||||
|
f"supervisord --configuration={CONFIG_FILE}",
|
||||||
|
stdin=None,
|
||||||
|
stdout=log_handle,
|
||||||
|
stderr=log_handle,
|
||||||
|
shell=True,
|
||||||
|
start_new_session=False, # Keep in same process group so signals propagate
|
||||||
|
)
|
||||||
|
|
||||||
def exit_signal_handler(signum, frame):
|
# Store the process so we can wait on it later
|
||||||
if signum == 2:
|
global _supervisord_proc
|
||||||
STDERR.print("\n[🛑] Got Ctrl+C. Terminating child processes...")
|
_supervisord_proc = proc
|
||||||
elif signum != 13:
|
|
||||||
STDERR.print(f"\n[🦸♂️] Supervisord got stop signal ({signal.strsignal(signum)}). Terminating child processes...")
|
|
||||||
stop_existing_supervisord_process()
|
|
||||||
raise SystemExit(0)
|
|
||||||
|
|
||||||
# Monitor for termination signals and cleanup child processes
|
# Wait a bit for supervisord to start up
|
||||||
if not daemonize:
|
time.sleep(2)
|
||||||
try:
|
|
||||||
signal.signal(signal.SIGINT, exit_signal_handler)
|
|
||||||
signal.signal(signal.SIGHUP, exit_signal_handler)
|
|
||||||
signal.signal(signal.SIGPIPE, exit_signal_handler)
|
|
||||||
signal.signal(signal.SIGTERM, exit_signal_handler)
|
|
||||||
except Exception:
|
|
||||||
# signal handlers only work in main thread
|
|
||||||
pass
|
|
||||||
# otherwise supervisord will continue in the background even if the parent proc ends (aka daemon mode)
|
|
||||||
|
|
||||||
time.sleep(2)
|
return get_existing_supervisord_process()
|
||||||
|
|
||||||
return get_existing_supervisord_process()
|
|
||||||
|
|
||||||
def get_or_create_supervisord_process(daemonize=False):
|
def get_or_create_supervisord_process(daemonize=False):
|
||||||
SOCK_FILE = get_sock_file()
|
SOCK_FILE = get_sock_file()
|
||||||
@@ -353,9 +393,15 @@ def tail_worker_logs(log_path: str):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def tail_multiple_worker_logs(log_files: list[str], follow=True):
|
def tail_multiple_worker_logs(log_files: list[str], follow=True, proc=None):
|
||||||
"""Tail multiple log files simultaneously, interleaving their output."""
|
"""Tail multiple log files simultaneously, interleaving their output.
|
||||||
import select
|
|
||||||
|
Args:
|
||||||
|
log_files: List of log file paths to tail
|
||||||
|
follow: Whether to keep following (True) or just read existing content (False)
|
||||||
|
proc: Optional subprocess.Popen object - stop tailing when this process exits
|
||||||
|
"""
|
||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
# Convert relative paths to absolute paths
|
# Convert relative paths to absolute paths
|
||||||
@@ -377,48 +423,53 @@ def tail_multiple_worker_logs(log_files: list[str], follow=True):
|
|||||||
for log_path in log_paths:
|
for log_path in log_paths:
|
||||||
try:
|
try:
|
||||||
f = open(log_path, 'r')
|
f = open(log_path, 'r')
|
||||||
# Seek to end of file if following
|
# Don't seek to end - show recent content so user sees something
|
||||||
if follow:
|
# Go to end minus 4KB to show some recent logs
|
||||||
f.seek(0, 2) # Seek to end
|
f.seek(0, 2) # Go to end first
|
||||||
file_handles.append((log_path.name, f))
|
file_size = f.tell()
|
||||||
|
if file_size > 4096:
|
||||||
|
f.seek(file_size - 4096)
|
||||||
|
f.readline() # Skip partial line
|
||||||
|
else:
|
||||||
|
f.seek(0) # Small file, read from start
|
||||||
|
|
||||||
|
file_handles.append((log_path, f))
|
||||||
|
print(f" [tailing {log_path.name}]")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[yellow]Warning: Could not open {log_path}: {e}[/yellow]")
|
sys.stderr.write(f"Warning: Could not open {log_path}: {e}\n")
|
||||||
|
|
||||||
if not file_handles:
|
if not file_handles:
|
||||||
print("[red]No log files could be opened[/red]")
|
sys.stderr.write("No log files could be opened\n")
|
||||||
return
|
return
|
||||||
|
|
||||||
# Print which logs we're tailing
|
|
||||||
log_names = [name for name, _ in file_handles]
|
|
||||||
print(f"[dim]Tailing: {', '.join(log_names)}[/dim]")
|
|
||||||
print()
|
print()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
while follow:
|
while follow:
|
||||||
# Read available lines from all files
|
# Check if the monitored process has exited
|
||||||
for log_name, f in file_handles:
|
if proc is not None and proc.poll() is not None:
|
||||||
line = f.readline()
|
print(f"\n[server process exited with code {proc.returncode}]")
|
||||||
if line:
|
break
|
||||||
# Colorize based on log source
|
|
||||||
if 'orchestrator' in log_name.lower():
|
|
||||||
color = 'cyan'
|
|
||||||
elif 'daphne' in log_name.lower():
|
|
||||||
color = 'green'
|
|
||||||
else:
|
|
||||||
color = 'white'
|
|
||||||
|
|
||||||
|
had_output = False
|
||||||
|
# Read ALL available lines from all files (not just one per iteration)
|
||||||
|
for log_path, f in file_handles:
|
||||||
|
while True:
|
||||||
|
line = f.readline()
|
||||||
|
if not line:
|
||||||
|
break # No more lines available in this file
|
||||||
|
had_output = True
|
||||||
# Strip ANSI codes if present (supervisord does this but just in case)
|
# Strip ANSI codes if present (supervisord does this but just in case)
|
||||||
import re
|
|
||||||
line_clean = re.sub(r'\x1b\[[0-9;]*m', '', line.rstrip())
|
line_clean = re.sub(r'\x1b\[[0-9;]*m', '', line.rstrip())
|
||||||
|
|
||||||
if line_clean:
|
if line_clean:
|
||||||
print(f'[{color}][{log_name}][/{color}] {line_clean}')
|
print(line_clean)
|
||||||
|
|
||||||
# Small sleep to avoid busy-waiting
|
# Small sleep to avoid busy-waiting (only when no output)
|
||||||
time.sleep(0.1)
|
if not had_output:
|
||||||
|
time.sleep(0.05)
|
||||||
|
|
||||||
except (KeyboardInterrupt, BrokenPipeError, IOError):
|
except (KeyboardInterrupt, BrokenPipeError, IOError):
|
||||||
print("\n[yellow][i] Stopped tailing logs[/i][/yellow]")
|
pass # Let the caller handle the cleanup message
|
||||||
except SystemExit:
|
except SystemExit:
|
||||||
pass
|
pass
|
||||||
finally:
|
finally:
|
||||||
@@ -451,6 +502,8 @@ def watch_worker(supervisor, daemon_name, interval=5):
|
|||||||
|
|
||||||
|
|
||||||
def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
|
def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
|
||||||
|
global _supervisord_proc
|
||||||
|
|
||||||
supervisor = get_or_create_supervisord_process(daemonize=daemonize)
|
supervisor = get_or_create_supervisord_process(daemonize=daemonize)
|
||||||
|
|
||||||
bg_workers = [
|
bg_workers = [
|
||||||
@@ -466,36 +519,50 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
|
|||||||
|
|
||||||
if not daemonize:
|
if not daemonize:
|
||||||
try:
|
try:
|
||||||
watch_worker(supervisor, "worker_daphne")
|
# Tail worker logs while supervisord runs
|
||||||
|
sys.stdout.write('Tailing worker logs (Ctrl+C to stop)...\n\n')
|
||||||
|
sys.stdout.flush()
|
||||||
|
tail_multiple_worker_logs(
|
||||||
|
log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'],
|
||||||
|
follow=True,
|
||||||
|
proc=_supervisord_proc, # Stop tailing when supervisord exits
|
||||||
|
)
|
||||||
except (KeyboardInterrupt, BrokenPipeError, IOError):
|
except (KeyboardInterrupt, BrokenPipeError, IOError):
|
||||||
STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
|
STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
|
||||||
except SystemExit:
|
except SystemExit:
|
||||||
pass
|
pass
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...")
|
STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping gracefully...")
|
||||||
raise
|
|
||||||
finally:
|
finally:
|
||||||
stop_worker(supervisor, "worker_daphne")
|
# Ensure supervisord and all children are stopped
|
||||||
|
stop_existing_supervisord_process()
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
|
||||||
def start_cli_workers(watch=False):
|
def start_cli_workers(watch=False):
|
||||||
|
global _supervisord_proc
|
||||||
|
|
||||||
supervisor = get_or_create_supervisord_process(daemonize=False)
|
supervisor = get_or_create_supervisord_process(daemonize=False)
|
||||||
|
|
||||||
start_worker(supervisor, ORCHESTRATOR_WORKER)
|
start_worker(supervisor, ORCHESTRATOR_WORKER)
|
||||||
|
|
||||||
if watch:
|
if watch:
|
||||||
try:
|
try:
|
||||||
watch_worker(supervisor, ORCHESTRATOR_WORKER['name'])
|
# Block on supervisord process - it will handle signals and stop children
|
||||||
|
if _supervisord_proc:
|
||||||
|
_supervisord_proc.wait()
|
||||||
|
else:
|
||||||
|
# Fallback to watching worker if no proc reference
|
||||||
|
watch_worker(supervisor, ORCHESTRATOR_WORKER['name'])
|
||||||
except (KeyboardInterrupt, BrokenPipeError, IOError):
|
except (KeyboardInterrupt, BrokenPipeError, IOError):
|
||||||
STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
|
STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
|
||||||
except SystemExit:
|
except SystemExit:
|
||||||
pass
|
pass
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping orchestrator gracefully...")
|
STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping gracefully...")
|
||||||
raise
|
|
||||||
finally:
|
finally:
|
||||||
stop_worker(supervisor, ORCHESTRATOR_WORKER['name'])
|
# Ensure supervisord and all children are stopped
|
||||||
|
stop_existing_supervisord_process()
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
return [ORCHESTRATOR_WORKER]
|
return [ORCHESTRATOR_WORKER]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user