remove Seed model in favor of Crawl as template

Nick Sweeting
2025-12-25 01:52:38 -08:00
parent 28e6c5bb65
commit bb53228ebf
30 changed files with 785 additions and 690 deletions

View File

@@ -0,0 +1,113 @@
# Generated by Django 6.0 on 2025-12-25 09:34
import django.utils.timezone
import signal_webhooks.fields
import signal_webhooks.utils
import uuid
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('api', '0001_squashed'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.AlterModelOptions(
name='outboundwebhook',
options={'verbose_name': 'API Outbound Webhook'},
),
migrations.AddField(
model_name='outboundwebhook',
name='created',
field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now, help_text='When the webhook was created.', verbose_name='created'),
preserve_default=False,
),
migrations.AddField(
model_name='outboundwebhook',
name='updated',
field=models.DateTimeField(auto_now=True, help_text='When the webhook was last updated.', verbose_name='updated'),
),
migrations.AlterField(
model_name='apitoken',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='apitoken',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='outboundwebhook',
name='auth_token',
field=signal_webhooks.fields.TokenField(blank=True, default='', help_text='Authentication token to use in an Authorization header.', max_length=8000, validators=[signal_webhooks.utils.decode_cipher_key], verbose_name='authentication token'),
),
migrations.AlterField(
model_name='outboundwebhook',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='outboundwebhook',
name='enabled',
field=models.BooleanField(default=True, help_text='Is this webhook enabled?', verbose_name='enabled'),
),
migrations.AlterField(
model_name='outboundwebhook',
name='endpoint',
field=models.URLField(help_text='Target endpoint for this webhook.', max_length=2047, verbose_name='endpoint'),
),
migrations.AlterField(
model_name='outboundwebhook',
name='headers',
field=models.JSONField(blank=True, default=dict, help_text='Headers to send with the webhook request.', validators=[signal_webhooks.utils.is_dict], verbose_name='headers'),
),
migrations.AlterField(
model_name='outboundwebhook',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='outboundwebhook',
name='keep_last_response',
field=models.BooleanField(default=False, help_text='Should the webhook keep a log of the latest response it got?', verbose_name='keep last response'),
),
migrations.AlterField(
model_name='outboundwebhook',
name='last_failure',
field=models.DateTimeField(default=None, help_text='When the webhook last failed.', null=True, verbose_name='last failure'),
),
migrations.AlterField(
model_name='outboundwebhook',
name='last_response',
field=models.CharField(blank=True, default='', help_text='Latest response to this webhook.', max_length=8000, verbose_name='last response'),
),
migrations.AlterField(
model_name='outboundwebhook',
name='last_success',
field=models.DateTimeField(default=None, help_text='When the webhook last succeeded.', null=True, verbose_name='last success'),
),
migrations.AlterField(
model_name='outboundwebhook',
name='name',
field=models.CharField(db_index=True, help_text='Webhook name.', max_length=255, unique=True, verbose_name='name'),
),
migrations.AlterField(
model_name='outboundwebhook',
name='ref',
field=models.CharField(db_index=True, help_text='Dot import notation to the model the webhook is for.', max_length=1023, validators=[signal_webhooks.utils.model_from_reference], verbose_name='referenced model'),
),
migrations.AlterField(
model_name='outboundwebhook',
name='signal',
field=models.CharField(choices=[('CREATE', 'Create'), ('UPDATE', 'Update'), ('DELETE', 'Delete'), ('M2M', 'M2M changed'), ('CREATE_OR_UPDATE', 'Create or Update'), ('CREATE_OR_DELETE', 'Create or Delete'), ('CREATE_OR_M2M', 'Create or M2M changed'), ('UPDATE_OR_DELETE', 'Update or Delete'), ('UPDATE_OR_M2M', 'Update or M2M changed'), ('DELETE_OR_M2M', 'Delete or M2M changed'), ('CREATE_UPDATE_OR_DELETE', 'Create, Update or Delete'), ('CREATE_UPDATE_OR_M2M', 'Create, Update or M2M changed'), ('CREATE_DELETE_OR_M2M', 'Create, Delete or M2M changed'), ('UPDATE_DELETE_OR_M2M', 'Update, Delete or M2M changed'), ('CREATE_UPDATE_DELETE_OR_M2M', 'Create, Update or Delete, or M2M changed')], help_text='Signal the webhook fires to.', max_length=255, verbose_name='signal'),
),
migrations.AddConstraint(
model_name='outboundwebhook',
constraint=models.UniqueConstraint(fields=('ref', 'endpoint'), name='prevent_duplicate_hooks_api_outboundwebhook'),
),
]
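The AddConstraint above makes (ref, endpoint) unique per webhook. A quick sketch of what that enforces (the api.models import path and all field values are assumptions for illustration only):

    from django.db import IntegrityError
    from api.models import OutboundWebhook  # assumed import path for the model this migration targets

    OutboundWebhook.objects.create(name='snapshot-hook', ref='core.models.Snapshot', endpoint='https://example.com/hook')
    try:
        OutboundWebhook.objects.create(name='snapshot-hook-2', ref='core.models.Snapshot', endpoint='https://example.com/hook')
    except IntegrityError:
        pass  # duplicate (ref, endpoint) pair rejected by prevent_duplicate_hooks_api_outboundwebhook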

View File

@@ -15,7 +15,7 @@ from ninja.pagination import paginate, PaginationBase
 from ninja.errors import HttpError
 from core.models import Snapshot, ArchiveResult, Tag
-from api.v1_crawls import CrawlSchema, SeedSchema
+from api.v1_crawls import CrawlSchema
 router = Router(tags=['Core Models'])
@@ -271,9 +271,9 @@ def get_tag(request, tag_id: str, with_snapshots: bool = True):
     return Tag.objects.get(slug__icontains=tag_id)
-@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID")
+@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID")
 def get_any(request, id: str):
-    """Get any object by its ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""
+    """Get any object by its ID (e.g. snapshot, archiveresult, tag, crawl, etc.)."""
     request.with_snapshots = False
     request.with_archiveresults = False
@@ -285,14 +285,6 @@ def get_any(request, id: str):
     except Exception:
         pass
-    try:
-        from api.v1_crawls import get_seed
-        response = get_seed(request, id)
-        if response:
-            return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.id}?{request.META['QUERY_STRING']}")
-    except Exception:
-        pass
     try:
         from api.v1_crawls import get_crawl
         response = get_crawl(request, id)

View File

@@ -10,53 +10,13 @@ from django.contrib.auth import get_user_model
 from ninja import Router, Schema
 from core.models import Snapshot
-from crawls.models import Seed, Crawl
+from crawls.models import Crawl
 from .auth import API_AUTH_METHODS
 router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
-class SeedSchema(Schema):
-    TYPE: str = 'crawls.models.Seed'
-    id: UUID
-    modified_at: datetime
-    created_at: datetime
-    created_by_id: str
-    created_by_username: str
-    uri: str
-    tags_str: str
-    config: dict
-    @staticmethod
-    def resolve_created_by_id(obj):
-        return str(obj.created_by_id)
-    @staticmethod
-    def resolve_created_by_username(obj):
-        User = get_user_model()
-        return User.objects.get(id=obj.created_by_id).username
-@router.get("/seeds", response=List[SeedSchema], url_name="get_seeds")
-def get_seeds(request):
-    return Seed.objects.all().distinct()
-@router.get("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
-def get_seed(request, seed_id: str):
-    seed = None
-    request.with_snapshots = False
-    request.with_archiveresults = False
-    try:
-        seed = Seed.objects.get(Q(id__icontains=seed_id))
-    except Exception:
-        pass
-    return seed
 class CrawlSchema(Schema):
     TYPE: str = 'crawls.models.Crawl'
@@ -70,8 +30,11 @@ class CrawlSchema(Schema):
     status: str
     retry_at: datetime | None
-    seed: SeedSchema
+    urls: str
+    extractor: str
     max_depth: int
+    tags_str: str
+    config: dict
     # snapshots: List[SnapshotSchema]
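With SeedSchema gone, the crawl endpoints expose the former seed fields directly on CrawlSchema. A rough sketch of a serialized Crawl as a Python dict (field names come from the schema above; every value is invented for illustration):

    example_crawl = {
        'TYPE': 'crawls.models.Crawl',
        'id': '01941c2e-0000-7000-8000-000000000000',       # illustrative UUIDv7
        'status': 'queued',
        'retry_at': None,
        'urls': 'https://example.com\nhttps://example.org',  # newline-separated URLs stored inline
        'extractor': 'auto',
        'max_depth': 1,
        'tags_str': 'docs,example',
        'config': {'ONLY_NEW': True},
    }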

View File

@@ -21,6 +21,7 @@ class ArchiveBoxGroup(click.Group):
     meta_commands = {
         'help': 'archivebox.cli.archivebox_help.main',
         'version': 'archivebox.cli.archivebox_version.main',
+        'mcp': 'archivebox.cli.archivebox_mcp.main',
     }
     setup_commands = {
         'init': 'archivebox.cli.archivebox_init.main',

View File

@@ -36,15 +36,14 @@ def add(urls: str | list[str],
         created_by_id: int | None=None) -> QuerySet['Snapshot']:
     """Add a new URL or list of URLs to your archive.
-    The new flow is:
+    The flow is:
     1. Save URLs to sources file
-    2. Create Seed pointing to the file
-    3. Create Crawl with max_depth
-    4. Create root Snapshot pointing to file:// URL (depth=0)
-    5. Orchestrator runs parser extractors on root snapshot
-    6. Parser extractors output to urls.jsonl
-    7. URLs are added to Crawl.urls and child Snapshots are created
-    8. Repeat until max_depth is reached
+    2. Create Crawl with URLs and max_depth
+    3. Orchestrator creates Snapshots from Crawl URLs (depth=0)
+    4. Orchestrator runs parser extractors on root snapshots
+    5. Parser extractors output to urls.jsonl
+    6. URLs are added to Crawl.urls and child Snapshots are created
+    7. Repeat until max_depth is reached
     """
     from rich import print
@@ -55,7 +54,7 @@ def add(urls: str | list[str],
     # import models once django is set up
     from core.models import Snapshot
-    from crawls.models import Seed, Crawl
+    from crawls.models import Crawl
     from archivebox.base_models.models import get_or_create_system_user_pk
     from workers.orchestrator import Orchestrator
@@ -66,19 +65,24 @@ def add(urls: str | list[str],
     sources_file.parent.mkdir(parents=True, exist_ok=True)
     sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
-    # 2. Create a new Seed pointing to the sources file
+    # 2. Create a new Crawl with inline URLs
     cli_args = [*sys.argv]
     if cli_args[0].lower().endswith('archivebox'):
         cli_args[0] = 'archivebox'
     cmd_str = ' '.join(cli_args)
     timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
-    seed = Seed.from_file(
-        sources_file,
+    # Read URLs directly into crawl
+    urls_content = sources_file.read_text()
+    crawl = Crawl.objects.create(
+        urls=urls_content,
+        extractor=parser,
+        max_depth=depth,
+        tags_str=tag,
         label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
-        parser=parser,
-        tag=tag,
-        created_by=created_by_id,
+        created_by_id=created_by_id,
         config={
             'ONLY_NEW': not update,
             'INDEX_ONLY': index_only,
@@ -88,15 +92,13 @@ def add(urls: str | list[str],
         }
     )
-    # 3. Create a new Crawl pointing to the Seed (status=queued)
-    crawl = Crawl.from_seed(seed, max_depth=depth)
     print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
-    print(f'    [dim]Seed: {seed.uri}[/dim]')
+    first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
+    print(f'    [dim]First URL: {first_url}[/dim]')
-    # 4. The CrawlMachine will create the root Snapshot when started
-    # Root snapshot URL = file:///path/to/sources/...txt
-    # Parser extractors will run on it and discover URLs
+    # 3. The CrawlMachine will create the root Snapshot when started
+    # If URLs are from a file: first URL = file:///path/to/sources/...txt
+    # Parser extractors will run on it and discover more URLs
     # Those URLs become child Snapshots (depth=1)
     if index_only:
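The docstring's numbered flow above no longer includes a Seed step: the URLs live on the Crawl itself. A condensed sketch of the new-side code from this hunk (variable names as in the hunk; the sources file is whatever step 1 wrote):

    # 1. URLs were already written to sources_file
    # 2. create the Crawl directly -- no Seed.from_file() / Crawl.from_seed() pair anymore
    from crawls.models import Crawl
    crawl = Crawl.objects.create(
        urls=sources_file.read_text(),   # newline-separated URLs stored inline on the Crawl
        extractor=parser,
        max_depth=depth,
        tags_str=tag,
        created_by_id=created_by_id,
    )
    # 3. the CrawlMachine creates root Snapshots and recurses until max_depth
    first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''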

View File

@@ -76,7 +76,7 @@ def discover_outlinks(
     )
     from archivebox.base_models.models import get_or_create_system_user_pk
     from core.models import Snapshot, ArchiveResult
-    from crawls.models import Seed, Crawl
+    from crawls.models import Crawl
     from archivebox.config import CONSTANTS
     from workers.orchestrator import Orchestrator
@@ -117,12 +117,12 @@ def discover_outlinks(
     sources_file.parent.mkdir(parents=True, exist_ok=True)
     sources_file.write_text('\n'.join(r.get('url', '') for r in new_url_records if r.get('url')))
-    seed = Seed.from_file(
+    crawl = Crawl.from_file(
         sources_file,
+        max_depth=depth,
         label=f'crawl --depth={depth}',
         created_by=created_by_id,
     )
-    crawl = Crawl.from_seed(seed, max_depth=depth)
     # Create snapshots for new URLs
     for record in new_url_records:
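The two-step Seed.from_file() + Crawl.from_seed() pattern collapses to one call here. For reference, condensed from the new side of the hunk above (arguments are whatever the caller already has in scope):

    # before: seed = Seed.from_file(sources_file, ...); crawl = Crawl.from_seed(seed, max_depth=depth)
    # after:  a single constructor that takes max_depth directly
    crawl = Crawl.from_file(
        sources_file,
        max_depth=depth,
        label=f'crawl --depth={depth}',
        created_by=created_by_id,
    )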

View File

@@ -42,27 +42,20 @@ def install(dry_run: bool=False) -> None:
     setup_django()
     from django.utils import timezone
-    from crawls.models import Seed, Crawl
+    from crawls.models import Crawl
     from archivebox.base_models.models import get_or_create_system_user_pk
-    # Create a seed and crawl for dependency detection
+    # Create a crawl for dependency detection
     # Using a minimal crawl that will trigger on_Crawl hooks
     created_by_id = get_or_create_system_user_pk()
-    seed, _created = Seed.objects.get_or_create(
-        uri='archivebox://install',
+    crawl, created = Crawl.objects.get_or_create(
+        urls='archivebox://install',
         label='Dependency detection',
         created_by_id=created_by_id,
         defaults={
             'extractor': 'auto',
-        }
-    )
-    crawl, created = Crawl.objects.get_or_create(
-        seed=seed,
-        max_depth=0,
-        created_by_id=created_by_id,
-        defaults={
+            'max_depth': 0,
             'status': 'queued',
         }
     )
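Dependency detection now needs only one row: the Seed get_or_create and the second Crawl get_or_create merge into a single call. Condensed from the new side above (the 'archivebox://install' sentinel comes straight from the hunk):

    crawl, created = Crawl.objects.get_or_create(
        urls='archivebox://install',   # sentinel pseudo-URL, no sources file involved
        label='Dependency detection',
        created_by_id=created_by_id,
        defaults={'extractor': 'auto', 'max_depth': 0, 'status': 'queued'},
    )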

View File

@@ -92,7 +92,7 @@ def create_snapshots(
     )
     from archivebox.base_models.models import get_or_create_system_user_pk
     from core.models import Snapshot
-    from crawls.models import Seed, Crawl
+    from crawls.models import Crawl
     from archivebox.config import CONSTANTS
     created_by_id = created_by_id or get_or_create_system_user_pk()
@@ -108,17 +108,17 @@ def create_snapshots(
     # If depth > 0, we need a Crawl to manage recursive discovery
     crawl = None
     if depth > 0:
-        # Create a seed for this batch
+        # Create a crawl for this batch
         sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__snapshot.txt'
         sources_file.parent.mkdir(parents=True, exist_ok=True)
         sources_file.write_text('\n'.join(r.get('url', '') for r in records if r.get('url')))
-        seed = Seed.from_file(
+        crawl = Crawl.from_file(
            sources_file,
+            max_depth=depth,
            label=f'snapshot --depth={depth}',
            created_by=created_by_id,
        )
-        crawl = Crawl.from_seed(seed, max_depth=depth)
     # Process each record
     created_snapshots = []

View File

@@ -111,53 +111,27 @@ def version(quiet: bool=False,
     machine = Machine.current()
-    # Get all *_BINARY config values
-    binary_config_keys = [key for key in config.keys() if key.endswith('_BINARY')]
+    # Get all installed binaries from the database
+    all_installed = InstalledBinary.objects.filter(
+        machine=machine
+    ).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
-    if not binary_config_keys:
-        prnt('', '[grey53]No binary dependencies defined in config.[/grey53]')
+    if not all_installed.exists():
+        prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
     else:
-        for key in sorted(set(binary_config_keys)):
-            # Get the actual binary name/path from config value
-            # Prioritize Machine.config overrides over base config
-            bin_value = machine.config.get(key) or config.get(key, '').strip()
-            if not bin_value:
+        for installed in all_installed:
+            # Skip if user specified specific binaries and this isn't one
+            if binaries and installed.name not in binaries:
                 continue
-            # Check if it's a path (has slashes) or just a name
-            is_path = '/' in str(bin_value)
-            if is_path:
-                # It's a full path - match against abspath
-                bin_name = Path(bin_value).name
-                # Skip if user specified specific binaries and this isn't one
-                if binaries and bin_name not in binaries:
-                    continue
-                # Find InstalledBinary where abspath ends with this path
-                installed = InstalledBinary.objects.filter(
-                    machine=machine,
-                    abspath__endswith=bin_value,
-                ).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
-            else:
-                # It's just a binary name - match against name
-                bin_name = bin_value
-                # Skip if user specified specific binaries and this isn't one
-                if binaries and bin_name not in binaries:
-                    continue
-                # Find InstalledBinary by name
-                installed = InstalledBinary.objects.filter(
-                    machine=machine,
-                    name__iexact=bin_name,
-                ).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
-            if installed and installed.is_valid:
+            if installed.is_valid:
                 display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
                 version_str = (installed.version or 'unknown')[:15]
                 provider = (installed.binprovider or 'env')[:8]
-                prnt('', '[green]√[/green]', '', bin_name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
+                prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
             else:
-                prnt('', '[red]X[/red]', '', bin_name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
-                failures.append(bin_name)
+                prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
+                failures.append(installed.name)
     # Show hint if no binaries are installed yet
     has_any_installed = InstalledBinary.objects.filter(machine=machine).exclude(abspath='').exists()

View File

@@ -96,10 +96,8 @@ class ConstantsDict(Mapping):
     # Data dir files
     CONFIG_FILENAME: str = 'ArchiveBox.conf'
     SQL_INDEX_FILENAME: str = 'index.sqlite3'
-    QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'
     CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
     DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
-    QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME
     JSON_INDEX_FILENAME: str = 'index.json'
     HTML_INDEX_FILENAME: str = 'index.html'
@@ -184,10 +182,10 @@ class ConstantsDict(Mapping):
         SQL_INDEX_FILENAME,
         f"{SQL_INDEX_FILENAME}-wal",
         f"{SQL_INDEX_FILENAME}-shm",
-        QUEUE_DATABASE_FILENAME,
-        f"{QUEUE_DATABASE_FILENAME}-wal",
-        f"{QUEUE_DATABASE_FILENAME}-shm",
         "search.sqlite3",
+        "queue.sqlite3",
+        "queue.sqlite3-wal",
+        "queue.sqlite3-shm",
         JSON_INDEX_FILENAME,
         HTML_INDEX_FILENAME,
         ROBOTS_TXT_FILENAME,

View File

@@ -56,6 +56,14 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
         os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
         os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
+    # Suppress the "database access during app initialization" warning
+    # This warning can be triggered during django.setup() but is safe to ignore
+    # since we're doing intentional setup operations
+    import warnings
+    warnings.filterwarnings('ignore',
+                            message='.*Accessing the database during app initialization.*',
+                            category=RuntimeWarning)
     try:
         from django.core.management import call_command
@@ -87,7 +95,8 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
                 style='bold red',
             ))
             STDERR.print()
-            STDERR.print_exception(show_locals=False)
+            import traceback
+            traceback.print_exc()
             return
     from django.conf import settings

View File

@@ -224,12 +224,6 @@ def get_data_locations():
             "is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
             "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
         },
-        "QUEUE_DATABASE": {
-            "path": CONSTANTS.QUEUE_DATABASE_FILE,
-            "enabled": True,
-            "is_valid": os.path.isfile(CONSTANTS.QUEUE_DATABASE_FILE) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.R_OK) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.W_OK),
-            "is_mount": os.path.ismount(CONSTANTS.QUEUE_DATABASE_FILE),
-        },
         "ARCHIVE_DIR": {
             "path": ARCHIVE_DIR.resolve(),
             "enabled": True,

View File

@@ -33,15 +33,18 @@ GLOBAL_CONTEXT = {}
 class SnapshotActionForm(ActionForm):
-    tags = forms.ModelMultipleChoiceField(
-        label='Edit tags',
-        queryset=Tag.objects.all(),
-        required=False,
-        widget=FilteredSelectMultiple(
-            'core_tag__name',
-            False,
-        ),
-    )
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Define tags field in __init__ to avoid database access during app initialization
+        self.fields['tags'] = forms.ModelMultipleChoiceField(
+            label='Edit tags',
+            queryset=Tag.objects.all(),
+            required=False,
+            widget=FilteredSelectMultiple(
+                'core_tag__name',
+                False,
+            ),
+        )
     # TODO: allow selecting actions for specific extractors? is this useful?
     # extractor = forms.ChoiceField(
@@ -165,14 +168,69 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
     def admin_actions(self, obj):
         return format_html(
-            # URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
             '''
-            <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}">Summary page ➡️</a> &nbsp; &nbsp;
-            <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}/index.html#all">Result files 📑</a> &nbsp; &nbsp;
-            <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/admin/core/snapshot/?id__exact={}">Admin actions ⚙️</a>
+            <div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: center;">
+            <a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
+               href="/archive/{}"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📄 Summary Page
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/archive/{}/index.html#all"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📁 Result Files
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="{}"
target="_blank"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
🔗 Original URL
</a>
<span style="border-left: 1px solid #e2e8f0; height: 24px; margin: 0 4px;"></span>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Get missing extractors"
onmouseover="this.style.background='#d1fae5';"
onmouseout="this.style.background='#ecfdf5';">
⬇️ Get Missing
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #eff6ff; border: 1px solid #bfdbfe; border-radius: 8px; color: #1e40af; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Create a fresh new snapshot of this URL"
onmouseover="this.style.background='#dbeafe';"
onmouseout="this.style.background='#eff6ff';">
🆕 Archive Again
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fffbeb; border: 1px solid #fde68a; border-radius: 8px; color: #92400e; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Re-run all extractors (overwrite existing)"
onmouseover="this.style.background='#fef3c7';"
onmouseout="this.style.background='#fffbeb';">
🔄 Redo All
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fef2f2; border: 1px solid #fecaca; border-radius: 8px; color: #991b1b; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Permanently delete this snapshot"
onmouseover="this.style.background='#fee2e2';"
onmouseout="this.style.background='#fef2f2';">
☠️ Delete
</a>
</div>
<p style="margin-top: 12px; font-size: 12px; color: #64748b;">
<b>Tip:</b> Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
</p>
             ''',
             obj.timestamp,
             obj.timestamp,
+            obj.url,
+            obj.pk,
+            obj.pk,
+            obj.pk,
             obj.pk,
         )

View File

@@ -0,0 +1,101 @@
# Generated by Django 6.0 on 2025-12-25 09:34
import archivebox.base_models.models
import django.db.models.deletion
import django.utils.timezone
import uuid
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0025_allow_duplicate_urls_per_crawl'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.RemoveField(
model_name='archiveresult',
name='output_dir',
),
migrations.RemoveField(
model_name='snapshot',
name='output_dir',
),
migrations.AlterField(
model_name='archiveresult',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(db_index=True, max_length=32),
),
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.AutoField(editable=False, primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True, unique=True),
),
migrations.AlterField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
# migrations.AlterField(
# model_name='snapshot',
# name='tags',
# field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
# ),
migrations.AlterField(
model_name='snapshottag',
name='id',
field=models.AutoField(primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='tag',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterUniqueTogether(
name='snapshottag',
unique_together={('snapshot', 'tag')},
),
]

View File

@@ -59,7 +59,7 @@ INSTALLED_APPS = [
     "config",      # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
     "machine",     # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
     "workers",     # handles starting and managing background workers and processes (orchestrators and actors)
-    "crawls",      # handles Seed, Crawl, and CrawlSchedule models and management
+    "crawls",      # handles Crawl and CrawlSchedule models and management
     "personas",    # handles Persona and session management
     "core",        # core django model with Snapshot, ArchiveResult, etc.
     "api",         # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
@@ -194,10 +194,6 @@ DATABASES = {
         "NAME": DATABASE_NAME,
         **SQLITE_CONNECTION_OPTIONS,
     },
-    "queue": {
-        "NAME": CONSTANTS.QUEUE_DATABASE_FILE,
-        **SQLITE_CONNECTION_OPTIONS,
-    },
     # "filestore": {
     #     "NAME": CONSTANTS.FILESTORE_DATABASE_FILE,
     #     **SQLITE_CONNECTION_OPTIONS,

View File

@@ -2,8 +2,6 @@ __package__ = 'archivebox.core'
 import re
 import os
-import shutil
 import tempfile
 import logging
@@ -11,7 +9,6 @@ import pydantic
 import django.template
 from archivebox.config import CONSTANTS
-from archivebox.misc.logging import IS_TTY
 IGNORABLE_URL_PATTERNS = [
@@ -79,7 +76,6 @@ SETTINGS_LOGGING = {
     "formatters": {
         "rich": {
             "datefmt": "[%Y-%m-%d %H:%M:%S]",
-            # "format": "{asctime} {levelname} {module} {name} {message} {username}",
             "format": "%(name)s %(message)s",
         },
         "outbound_webhooks": {
@@ -99,26 +95,13 @@ SETTINGS_LOGGING = {
         },
     },
     "handlers": {
-        # "console": {
-        #     "level": "DEBUG",
-        #     'formatter': 'simple',
-        #     "class": "logging.StreamHandler",
-        #     'filters': ['noisyrequestsfilter', 'add_extra_logging_attrs'],
-        # },
         "default": {
             "class": "rich.logging.RichHandler",
             "formatter": "rich",
             "level": "DEBUG",
             "markup": False,
-            "rich_tracebacks": IS_TTY,
+            "rich_tracebacks": False,  # Use standard Python tracebacks (no frame/box)
             "filters": ["noisyrequestsfilter"],
-            "tracebacks_suppress": [
-                django,
-                pydantic,
-            ],
-            "tracebacks_width": shutil.get_terminal_size((100, 10)).columns - 1,
-            "tracebacks_word_wrap": False,
-            "tracebacks_show_locals": False,
         },
         "logfile": {
             "level": "INFO",
@@ -132,7 +115,7 @@ SETTINGS_LOGGING = {
         "outbound_webhooks": {
             "class": "rich.logging.RichHandler",
             "markup": False,
-            "rich_tracebacks": True,
+            "rich_tracebacks": False,  # Use standard Python tracebacks (no frame/box)
             "formatter": "outbound_webhooks",
         },
         # "mail_admins": {

View File

@@ -15,7 +15,7 @@ from statemachine import State, StateMachine
 # from workers.actor import ActorType
 from core.models import Snapshot, ArchiveResult
-from crawls.models import Crawl, Seed
+from crawls.models import Crawl
 class SnapshotMachine(StateMachine, strict_states=True):
@@ -247,17 +247,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
         )
         self.archiveresult.save(write_indexes=True)
-        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed
+        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
         ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
         Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
-        # Also update Crawl and Seed health stats if snapshot has a crawl
+        # Also update Crawl health stats if snapshot has a crawl
         snapshot = self.archiveresult.snapshot
         if snapshot.crawl_id:
             Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
-            crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first()
-            if crawl:
-                Seed.objects.filter(pk=crawl).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
     @failed.enter
     def enter_failed(self):
@@ -268,17 +265,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
             end_ts=timezone.now(),
         )
-        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed
+        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
         ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_failed=F('num_uses_failed') + 1)
         Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_failed=F('num_uses_failed') + 1)
-        # Also update Crawl and Seed health stats if snapshot has a crawl
+        # Also update Crawl health stats if snapshot has a crawl
         snapshot = self.archiveresult.snapshot
         if snapshot.crawl_id:
             Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_failed=F('num_uses_failed') + 1)
-            crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first()
-            if crawl:
-                Seed.objects.filter(pk=crawl).update(num_uses_failed=F('num_uses_failed') + 1)
     @skipped.enter
     def enter_skipped(self):

View File

@@ -33,7 +33,7 @@ from archivebox.search import query_search_index
 from core.models import Snapshot
 from core.forms import AddLinkForm
-from crawls.models import Seed, Crawl
+from crawls.models import Crawl
 from archivebox.hooks import get_extractors, get_extractor_name
@@ -119,7 +119,11 @@ class SnapshotView(View):
             if result_file.name in existing_files or result_file.name == 'index.html':
                 continue
-            file_size = result_file.stat().st_size or 0
+            # Skip circular symlinks and other stat() failures
+            try:
+                file_size = result_file.stat().st_size or 0
+            except OSError:
+                continue
             if file_size > min_size_threshold:
                 archiveresults[result_file.name] = {
@@ -471,14 +475,16 @@ class AddView(UserPassesTestMixin, FormView):
         sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
         sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
-        # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
+        # 2. create a new Crawl with the URLs from the file
         timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
-        seed = Seed.from_file(
-            sources_file,
+        urls_content = sources_file.read_text()
+        crawl = Crawl.objects.create(
+            urls=urls_content,
+            extractor=parser,
+            max_depth=depth,
+            tags_str=tag,
             label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
-            parser=parser,
-            tag=tag,
-            created_by=self.request.user.pk,
+            created_by_id=self.request.user.pk,
             config={
                 # 'ONLY_NEW': not update,
                 # 'INDEX_ONLY': index_only,
@@ -486,9 +492,8 @@ class AddView(UserPassesTestMixin, FormView):
                 'DEPTH': depth,
                 'EXTRACTORS': extractors or '',
                 # 'DEFAULT_PERSONA': persona or 'Default',
-            })
-        # 3. create a new Crawl pointing to the Seed
-        crawl = Crawl.from_seed(seed, max_depth=depth)
+            }
+        )
         # 4. start the Orchestrator & wait until it completes
         # ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
@@ -569,19 +574,7 @@ def live_progress_view(request):
     # Count URLs in the crawl (for when snapshots haven't been created yet)
     urls_count = 0
     if crawl.urls:
-        urls_count = len([u for u in crawl.urls.split('\n') if u.strip()])
-    elif crawl.seed and crawl.seed.uri:
-        # Try to get URL count from seed
-        if crawl.seed.uri.startswith('file:///'):
-            try:
-                from pathlib import Path
-                seed_file = Path(crawl.seed.uri.replace('file://', ''))
-                if seed_file.exists():
-                    urls_count = len([l for l in seed_file.read_text().split('\n') if l.strip() and not l.startswith('#')])
-            except:
-                pass
-        else:
-            urls_count = 1  # Single URL seed
+        urls_count = len([u for u in crawl.urls.split('\n') if u.strip() and not u.startswith('#')])
     # Calculate crawl progress
     crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
@@ -635,8 +628,8 @@ def live_progress_view(request):
     })
     # Check if crawl can start (for debugging stuck crawls)
-    can_start = bool(crawl.seed and crawl.seed.uri)
-    seed_uri = crawl.seed.uri[:60] if crawl.seed and crawl.seed.uri else None
+    can_start = bool(crawl.urls)
+    urls_preview = crawl.urls[:60] if crawl.urls else None
     # Check if retry_at is in the future (would prevent worker from claiming)
     retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
@@ -657,7 +650,7 @@ def live_progress_view(request):
         'pending_snapshots': pending_snapshots,
         'active_snapshots': active_snapshots_for_crawl,
         'can_start': can_start,
-        'seed_uri': seed_uri,
+        'urls_preview': urls_preview,
         'retry_at_future': retry_at_future,
         'seconds_until_retry': seconds_until_retry,
     })

View File

@@ -17,7 +17,7 @@ from django_object_actions import action
 from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
 from core.models import Snapshot
-from crawls.models import Seed, Crawl, CrawlSchedule
+from crawls.models import Crawl, CrawlSchedule
 def render_snapshots_list(snapshots_qs, limit=20):
@@ -136,16 +136,16 @@ def render_snapshots_list(snapshots_qs, limit=20):
     ''')
-class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
-    list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
-    sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
-    search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
-    readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
+class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
+    list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
+    sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'schedule_str', 'status', 'retry_at')
+    search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'schedule_id', 'status', 'urls')
+    readonly_fields = ('created_at', 'modified_at', 'snapshots', 'urls_editor')
     fieldsets = (
-        ('Source', {
-            'fields': ('uri', 'contents'),
+        ('URLs', {
+            'fields': ('urls_editor',),
             'classes': ('card', 'wide'),
         }),
         ('Info', {
@@ -153,83 +153,7 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
             'classes': ('card',),
         }),
         ('Settings', {
-            'fields': ('extractor', 'config'),
+            'fields': ('max_depth', 'extractor', 'config'),
'classes': ('card',),
}),
('Metadata', {
'fields': ('created_by', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Crawls', {
'fields': ('scheduled_crawls', 'crawls'),
'classes': ('card',),
}),
('Snapshots', {
'fields': ('snapshots',),
'classes': ('card',),
}),
)
list_filter = ('extractor', 'created_by')
ordering = ['-created_at']
list_per_page = 100
actions = ["delete_selected"]
def num_crawls(self, obj):
return obj.crawl_set.count()
def num_snapshots(self, obj):
return obj.snapshot_set.count()
def scheduled_crawls(self, obj):
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
(scheduledcrawl.admin_change_url, scheduledcrawl)
for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
)) or mark_safe('<i>No Scheduled Crawls yet...</i>')
def crawls(self, obj):
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
(crawl.admin_change_url, crawl)
for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
)) or mark_safe('<i>No Crawls yet...</i>')
def snapshots(self, obj):
return render_snapshots_list(obj.snapshot_set.all())
def contents(self, obj):
source_file = obj.get_file_path()
if source_file:
contents = ""
try:
contents = source_file.read_text().strip()[:14_000]
except Exception as e:
contents = f'Error reading {source_file}: {e}'
return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
fieldsets = (
('URLs', {
'fields': ('seed_urls_editor',),
'classes': ('card', 'wide'),
}),
('Info', {
'fields': ('label', 'notes'),
'classes': ('card',),
}),
('Settings', {
'fields': ('max_depth', 'config'),
             'classes': ('card',),
         }),
         ('Status', {
@@ -237,7 +161,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
             'classes': ('card',),
         }),
         ('Relations', {
-            'fields': ('seed', 'schedule', 'created_by'),
+            'fields': ('schedule', 'created_by'),
             'classes': ('card',),
         }),
         ('Timestamps', {
@@ -250,7 +174,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
         }),
     )
-    list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
+    list_filter = ('max_depth', 'extractor', 'schedule', 'created_by', 'status', 'retry_at')
     ordering = ['-created_at', '-retry_at']
     list_per_page = 100
     actions = ["delete_selected"]
@@ -258,23 +182,20 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
     @action(label='Recrawl', description='Create a new crawl with the same settings')
     def recrawl(self, request, obj):
-        """Duplicate this crawl as a new crawl with the same seed and settings."""
+        """Duplicate this crawl as a new crawl with the same URLs and settings."""
         from django.utils import timezone
         from django.shortcuts import redirect
-        # Validate seed has a URI (required for crawl to start)
-        if not obj.seed:
-            messages.error(request, 'Cannot recrawl: original crawl has no seed.')
-            return redirect('admin:crawls_crawl_change', obj.id)
-        if not obj.seed.uri:
-            messages.error(request, 'Cannot recrawl: seed has no URI.')
+        # Validate URLs (required for crawl to start)
+        if not obj.urls:
+            messages.error(request, 'Cannot recrawl: original crawl has no URLs.')
             return redirect('admin:crawls_crawl_change', obj.id)
         new_crawl = Crawl.objects.create(
-            seed=obj.seed,
             urls=obj.urls,
+            extractor=obj.extractor,
             max_depth=obj.max_depth,
+            tags_str=obj.tags_str,
             config=obj.config,
             schedule=obj.schedule,
             label=f"{obj.label} (recrawl)" if obj.label else "",
@@ -292,43 +213,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
         return redirect('admin:crawls_crawl_change', new_crawl.id)
def get_urls(self):
urls = super().get_urls()
custom_urls = [
path('<path:object_id>/save_seed_contents/',
self.admin_site.admin_view(self.save_seed_contents_view),
name='crawls_crawl_save_seed_contents'),
]
return custom_urls + urls
def save_seed_contents_view(self, request, object_id):
"""Handle saving seed file contents via AJAX."""
if request.method != 'POST':
return JsonResponse({'success': False, 'error': 'POST required'}, status=405)
try:
crawl = Crawl.objects.get(pk=object_id)
except Crawl.DoesNotExist:
return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
source_file = crawl.seed.get_file_path() if crawl.seed else None
if not source_file:
return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
try:
data = json.loads(request.body)
contents = data.get('contents', '')
except json.JSONDecodeError:
return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
try:
# Ensure parent directory exists
source_file.parent.mkdir(parents=True, exist_ok=True)
source_file.write_text(contents)
return JsonResponse({'success': True, 'message': f'Saved {len(contents)} bytes to {source_file.name}'})
except Exception as e:
return JsonResponse({'success': False, 'error': str(e)}, status=500)
     def num_snapshots(self, obj):
         return obj.snapshot_set.count()
@@ -341,163 +225,68 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
             return mark_safe('<i>None</i>')
         return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
-    @admin.display(description='Seed', ordering='seed')
-    def seed_str(self, obj):
-        if not obj.seed:
-            return mark_safe('<i>None</i>')
-        return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)
+    @admin.display(description='URLs', ordering='urls')
+    def urls_preview(self, obj):
+        first_url = obj.get_urls_list()[0] if obj.get_urls_list() else ''
+        return first_url[:80] + '...' if len(first_url) > 80 else first_url
     @admin.display(description='URLs')
-    def seed_urls_editor(self, obj):
-        """Combined editor showing seed URL and file contents."""
-        widget_id = f'seed_urls_{obj.pk}'
+    def urls_editor(self, obj):
+        """Editor for crawl URLs."""
+        widget_id = f'crawl_urls_{obj.pk}'
# Get the seed URI (or use urls field if no seed)
seed_uri = ''
if obj.seed and obj.seed.uri:
seed_uri = obj.seed.uri
elif obj.urls:
seed_uri = obj.urls
# Check if it's a local file we can edit # Check if it's a local file we can edit
source_file = obj.seed.get_file_path() if obj.seed else None source_file = obj.get_file_path()
is_file = source_file is not None is_file = source_file is not None
contents = "" file_contents = ""
error = None error = None
if is_file and source_file: if is_file and source_file:
try: try:
contents = source_file.read_text().strip() file_contents = source_file.read_text().strip()
except Exception as e: except Exception as e:
error = f'Error reading {source_file}: {e}' error = f'Error reading {source_file}: {e}'
# Escape for safe HTML embedding # Escape for safe HTML embedding
escaped_uri = seed_uri.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;') escaped_urls = (obj.urls or '').replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
escaped_contents = (contents or '').replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;') escaped_file_contents = file_contents.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
# Count lines for auto-expand logic # Count lines for auto-expand logic
line_count = len(contents.split('\n')) if contents else 0 line_count = len((obj.urls or '').split('\n'))
uri_rows = min(max(1, seed_uri.count('\n') + 1), 3) file_line_count = len(file_contents.split('\n')) if file_contents else 0
uri_rows = min(max(3, line_count), 10)
html = f''' html = f'''
<div id="{widget_id}_container" style="max-width: 900px;"> <div id="{widget_id}_container" style="max-width: 900px;">
<!-- Seed URL input (auto-expands) --> <!-- URLs input -->
<div style="margin-bottom: 12px;"> <div style="margin-bottom: 12px;">
<label style="font-weight: bold; display: block; margin-bottom: 4px;">Seed URL:</label> <label style="font-weight: bold; display: block; margin-bottom: 4px;">URLs (one per line):</label>
<textarea id="{widget_id}_uri" <textarea id="{widget_id}_urls"
style="width: 100%; font-family: monospace; font-size: 13px; style="width: 100%; font-family: monospace; font-size: 13px;
padding: 8px; border: 1px solid #ccc; border-radius: 4px; padding: 8px; border: 1px solid #ccc; border-radius: 4px;
resize: vertical; min-height: 32px; overflow: hidden;" resize: vertical;"
rows="{uri_rows}" rows="{uri_rows}"
placeholder="file:///data/sources/... or https://..." placeholder="https://example.com&#10;https://example2.com&#10;# Comments start with #"
{"readonly" if not obj.pk else ""}>{escaped_uri}</textarea> readonly>{escaped_urls}</textarea>
<p style="color: #666; font-size: 12px; margin: 4px 0 0 0;">
{line_count} URL{'s' if line_count != 1 else ''} · URLs are read-only in admin, edit via API or CLI
</p>
</div>
{"" if not is_file else f'''
<!-- File contents preview (if first URL is a file://) -->
<div style="margin-bottom: 8px;">
<label style="font-weight: bold; display: block; margin-bottom: 4px;">
File Preview: <code style="font-weight: normal; color: #666;">{source_file}</code>
</label>
{"<div style='color: #dc3545; margin-bottom: 8px;'>" + error + "</div>" if error else ""}
<textarea id="{widget_id}_file_preview"
style="width: 100%; height: {min(400, max(150, file_line_count * 18))}px; font-family: monospace; font-size: 12px;
padding: 8px; border: 1px solid #ccc; border-radius: 4px; resize: vertical; background: #f9f9f9;"
readonly>{escaped_file_contents}</textarea>
</div>
<div style="display: flex; gap: 12px; align-items: center; flex-wrap: wrap;">
<button type="button" id="{widget_id}_save_btn"
onclick="saveSeedUrls_{widget_id}()"
style="padding: 8px 20px; background: #417690; color: white; border: none;
border-radius: 4px; cursor: pointer; font-weight: bold;">
Save URLs
</button>
<span id="{widget_id}_line_count" style="color: #666; font-size: 12px;"></span>
<span id="{widget_id}_status" style="color: #666; font-size: 12px;"></span>
</div>
'''}
{"" if is_file else f'''
<div style="margin-top: 8px; color: #666;">
<a href="{seed_uri}" target="_blank">{seed_uri}</a>
</div>
'''}
<script>
(function() {{
var uriInput = document.getElementById('{widget_id}_uri');
var contentsInput = document.getElementById('{widget_id}_contents');
var status = document.getElementById('{widget_id}_status');
var lineCount = document.getElementById('{widget_id}_line_count');
var saveBtn = document.getElementById('{widget_id}_save_btn');
// Auto-resize URI input
function autoResizeUri() {{
uriInput.style.height = 'auto';
uriInput.style.height = Math.min(100, uriInput.scrollHeight) + 'px';
}}
uriInput.addEventListener('input', autoResizeUri);
autoResizeUri();
if (contentsInput) {{
function updateLineCount() {{
var lines = contentsInput.value.split('\\n').filter(function(l) {{ return l.trim(); }});
lineCount.textContent = lines.length + ' URLs';
}}
contentsInput.addEventListener('input', function() {{
updateLineCount();
if (status) {{
status.textContent = '(unsaved changes)';
status.style.color = '#c4820e';
}}
}});
updateLineCount();
}}
window.saveSeedUrls_{widget_id} = function() {{
if (!saveBtn) return;
saveBtn.disabled = true;
saveBtn.textContent = 'Saving...';
if (status) status.textContent = '';
fetch(window.location.pathname + 'save_seed_contents/', {{
method: 'POST',
headers: {{
'Content-Type': 'application/json',
'X-CSRFToken': document.querySelector('[name=csrfmiddlewaretoken]').value
}},
body: JSON.stringify({{ contents: contentsInput ? contentsInput.value : '' }})
}})
.then(function(response) {{ return response.json(); }})
.then(function(data) {{
if (data.success) {{
if (status) {{
status.textContent = '' + data.message;
status.style.color = '#28a745';
}}
}} else {{
if (status) {{
status.textContent = '' + data.error;
status.style.color = '#dc3545';
}}
}}
}})
.catch(function(err) {{
if (status) {{
status.textContent = '✗ Error: ' + err;
status.style.color = '#dc3545';
}}
}})
.finally(function() {{
saveBtn.disabled = false;
saveBtn.textContent = 'Save URLs';
}});
}};
}})();
</script>
</div>
'''
return mark_safe(html)
@@ -507,7 +296,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
class CrawlScheduleAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str', 'crawls', 'num_crawls', 'num_snapshots')
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str')
search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__urls')
readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots')
@@ -561,6 +350,5 @@ class CrawlScheduleAdmin(BaseModelAdmin):
def register_admin(admin_site):
admin_site.register(Seed, SeedAdmin)
admin_site.register(Crawl, CrawlAdmin)
admin_site.register(CrawlSchedule, CrawlScheduleAdmin)
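# Side note (not part of this commit): the manual .replace() chains used for HTML-escaping in the
# URLs widget above are equivalent to the stdlib helper below, which also escapes single quotes —
# a minimal sketch:
#
#     from html import escape
#     escaped_urls = escape(obj.urls or '', quote=True)  # escapes &, <, >, " (and ')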

View File

@@ -0,0 +1,61 @@
# Generated by Django 6.0 on 2025-12-25 09:34
import archivebox.base_models.models
import django.db.models.deletion
import pathlib
import uuid
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('crawls', '0001_initial'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.RemoveField(
model_name='crawl',
name='seed',
),
migrations.AddField(
model_name='crawl',
name='extractor',
field=models.CharField(default='auto', help_text='Parser for reading URLs (auto, html, json, rss, etc)', max_length=32),
),
migrations.AlterField(
model_name='crawl',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='crawl',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='crawl',
name='output_dir',
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/data/archive')),
),
migrations.AlterField(
model_name='crawl',
name='urls',
field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
),
migrations.AlterField(
model_name='crawlschedule',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='crawlschedule',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.DeleteModel(
name='Seed',
),
]

View File

@@ -20,91 +20,6 @@ if TYPE_CHECKING:
from core.models import Snapshot, ArchiveResult
class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats):
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
modified_at = models.DateTimeField(auto_now=True)
uri = models.URLField(max_length=2048)
extractor = models.CharField(default='auto', max_length=32)
tags_str = models.CharField(max_length=255, null=False, blank=True, default='')
label = models.CharField(max_length=255, null=False, blank=True, default='')
config = models.JSONField(default=dict)
output_dir = models.FilePathField(path=settings.ARCHIVE_DIR, null=False, blank=True, default='')
notes = models.TextField(blank=True, null=False, default='')
crawl_set: models.Manager['Crawl']
class Meta:
verbose_name = 'Seed'
verbose_name_plural = 'Seeds'
unique_together = (('created_by', 'uri', 'extractor'), ('created_by', 'label'))
def __str__(self):
return f'[{self.id}] {self.uri[:64]}'
def save(self, *args, **kwargs):
is_new = self._state.adding
super().save(*args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Seed',
indent_level=0,
metadata={
'id': str(self.id),
'uri': str(self.uri)[:64],
'extractor': self.extractor,
'label': self.label or None,
},
)
@classmethod
def from_file(cls, source_file: Path, label: str = '', parser: str = 'auto', tag: str = '', created_by=None, config=None):
# Use absolute path for file:// URLs so extractors can find the files
source_path = str(source_file.resolve())
seed, _ = cls.objects.get_or_create(
label=label or source_file.name, uri=f'file://{source_path}',
created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
extractor=parser, tags_str=tag, config=config or {},
)
return seed
@property
def source_type(self):
return self.uri.split('://', 1)[0].lower()
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_seed', args=[self.id])
def get_file_path(self) -> Path | None:
"""
Get the filesystem path for file:// URIs.
Handles both old format (file:///data/...) and new format (file:///absolute/path).
Returns None if URI is not a file:// URI.
"""
if not self.uri.startswith('file://'):
return None
# Remove file:// prefix
path_str = self.uri.replace('file://', '', 1)
# Handle old format: file:///data/... -> DATA_DIR/...
if path_str.startswith('/data/'):
return CONSTANTS.DATA_DIR / path_str.replace('/data/', '', 1)
# Handle new format: file:///absolute/path
return Path(path_str)
@property
def snapshot_set(self) -> QuerySet['Snapshot']:
from core.models import Snapshot
return Snapshot.objects.filter(crawl_id__in=self.crawl_set.values_list('pk', flat=True))
class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
@@ -124,14 +39,15 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
verbose_name_plural = 'Scheduled Crawls'
def __str__(self) -> str:
urls_preview = self.template.urls[:64] if self.template and self.template.urls else ""
return f'[{self.id}] {urls_preview} @ {self.schedule}'
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_any', args=[self.id])
def save(self, *args, **kwargs):
self.label = self.label or (self.template.label if self.template else '')
super().save(*args, **kwargs)
if self.template:
self.template.schedule = self
@@ -144,8 +60,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
modified_at = models.DateTimeField(auto_now=True)
urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl')
extractor = models.CharField(default='auto', max_length=32, help_text='Parser for reading URLs (auto, html, json, rss, etc)')
config = models.JSONField(default=dict)
max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
@@ -171,31 +87,40 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
verbose_name_plural = 'Crawls'
def __str__(self):
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
return f'[{self.id}] {first_url[:64]}'
def save(self, *args, **kwargs):
is_new = self._state.adding
super().save(*args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
log_worker_event(
worker_type='DB',
event='Created Crawl',
indent_level=1,
metadata={
'id': str(self.id),
'first_url': first_url[:64],
'max_depth': self.max_depth,
'status': self.status,
},
)
@classmethod
def from_file(cls, source_file: Path, max_depth: int = 0, label: str = '', extractor: str = 'auto',
tags_str: str = '', config=None, created_by=None):
"""Create a crawl from a file containing URLs."""
urls_content = source_file.read_text()
crawl = cls.objects.create(
urls=urls_content,
extractor=extractor,
max_depth=max_depth,
tags_str=tags_str,
label=label or source_file.name,
config=config or {},
created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
)
return crawl
@@ -203,14 +128,47 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
def api_url(self) -> str:
return reverse_lazy('api-1:get_crawl', args=[self.id])
def get_urls_list(self) -> list[str]:
"""Get list of URLs from urls field, filtering out comments and empty lines."""
if not self.urls:
return []
return [
url.strip()
for url in self.urls.split('\n')
if url.strip() and not url.strip().startswith('#')
]
def get_file_path(self) -> Path | None:
"""
Get filesystem path if this crawl references a local file.
Checks if the first URL is a file:// URI.
"""
urls = self.get_urls_list()
if not urls:
return None
first_url = urls[0]
if not first_url.startswith('file://'):
return None
# Remove file:// prefix
path_str = first_url.replace('file://', '', 1)
return Path(path_str)
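# A minimal standalone sketch of the same parsing rules as get_urls_list()/get_file_path() above
# (pure function, no Django required; the sample URLs below are illustrative):
def parse_urls(urls_text: str) -> list[str]:
    # keep non-empty, non-comment lines, stripped of surrounding whitespace
    return [
        line.strip()
        for line in (urls_text or '').split('\n')
        if line.strip() and not line.strip().startswith('#')
    ]

assert parse_urls('https://example.com\n# a comment\n\nfile:///tmp/list.txt') == [
    'https://example.com',
    'file:///tmp/list.txt',
]
# get_file_path() would return None for these URLs, because the *first* URL is not a file:// URI.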
def create_root_snapshot(self) -> 'Snapshot':
from core.models import Snapshot
first_url = self.get_urls_list()[0] if self.get_urls_list() else None
if not first_url:
raise ValueError(f'Crawl {self.id} has no URLs to create root snapshot from')
try:
return Snapshot.objects.get(crawl=self, url=first_url)
except Snapshot.DoesNotExist:
pass
root_snapshot, _ = Snapshot.objects.update_or_create(
crawl=self, url=first_url,
defaults={
'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(),

View File

@@ -42,11 +42,12 @@ class CrawlMachine(StateMachine, strict_states=True):
return self.__repr__()
def can_start(self) -> bool:
if not self.crawl.urls:
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
return False
urls_list = self.crawl.get_urls_list()
if not urls_list:
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
return False
return True
@@ -121,13 +122,14 @@ class CrawlMachine(StateMachine, strict_states=True):
output_dir.mkdir(parents=True, exist_ok=True)
# Run all on_Crawl hooks
first_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ''
results = run_hooks(
event_name='Crawl',
output_dir=output_dir,
timeout=60,
config_objects=[self.crawl],
crawl_id=str(self.crawl.id),
seed_uri=first_url,
)
# Process hook results - parse JSONL output and create DB objects

View File

@@ -1,2 +0,0 @@
> /Users/squash/Local/Code/archiveboxes/archivebox-nue/archivebox/cli/archivebox_init.py --force; TS=2025-12-25__08:03:12 VERSION=0.9.0rc1 IN_DOCKER=False IS_TTY=False

View File

@@ -0,0 +1,65 @@
# Generated by Django 6.0 on 2025-12-25 09:34
import django.db.models.deletion
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('machine', '0001_squashed'),
]
operations = [
migrations.AlterField(
model_name='dependency',
name='bin_name',
field=models.CharField(db_index=True, help_text='Binary executable name (e.g., wget, yt-dlp, chromium)', max_length=63, unique=True),
),
migrations.AlterField(
model_name='dependency',
name='bin_providers',
field=models.CharField(default='*', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any', max_length=127),
),
migrations.AlterField(
model_name='dependency',
name='config',
field=models.JSONField(blank=True, default=dict, help_text='JSON map of env var config to use during install'),
),
migrations.AlterField(
model_name='dependency',
name='custom_cmds',
field=models.JSONField(blank=True, default=dict, help_text="JSON map of provider -> custom install command (e.g., {'apt': 'apt install -y wget'})"),
),
migrations.AlterField(
model_name='dependency',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='installedbinary',
name='dependency',
field=models.ForeignKey(blank=True, help_text='The Dependency this binary satisfies', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='installedbinary_set', to='machine.dependency'),
),
migrations.AlterField(
model_name='installedbinary',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='machine',
name='config',
field=models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)'),
),
migrations.AlterField(
model_name='machine',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='networkinterface',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
]

View File

@@ -27,10 +27,9 @@ TYPE_SNAPSHOT = 'Snapshot'
TYPE_ARCHIVERESULT = 'ArchiveResult'
TYPE_TAG = 'Tag'
TYPE_CRAWL = 'Crawl'
TYPE_SEED = 'Seed'
TYPE_INSTALLEDBINARY = 'InstalledBinary'
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_INSTALLEDBINARY}
def parse_line(line: str) -> Optional[Dict[str, Any]]:
@@ -206,7 +205,8 @@ def crawl_to_jsonl(crawl) -> Dict[str, Any]:
return {
'type': TYPE_CRAWL,
'id': str(crawl.id),
'urls': crawl.urls,
'extractor': crawl.extractor,
'status': crawl.status,
'max_depth': crawl.max_depth,
'created_at': crawl.created_at.isoformat() if crawl.created_at else None,
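# For reference, a Crawl line in the JSONL stream now carries urls/extractor instead of seed_id.
# A minimal sketch of emitting such a record (illustrative values; remaining keys omitted):
import json

record = {
    'type': 'Crawl',
    'id': '019400aa-0000-7000-8000-000000000000',  # hypothetical uuid7 value
    'urls': 'https://example.com\nhttps://example.org',
    'extractor': 'auto',
    'status': 'queued',
    'max_depth': 0,
}
print(json.dumps(record))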

View File

@@ -13,9 +13,11 @@ from rich.console import Console
from rich.highlighter import Highlighter
# SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS
# Disable wrapping - use soft_wrap=True and large width so text flows naturally
# Colors are preserved, just no hard line breaks inserted
CONSOLE = Console(width=32768, soft_wrap=True, force_terminal=True)
STDERR = Console(stderr=True, width=32768, soft_wrap=True, force_terminal=True)
IS_TTY = sys.stdout.isatty()
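# Minimal standalone sketch of the soft-wrap behaviour relied on above (not part of this module):
#
#     from rich.console import Console
#     console = Console(width=32768, soft_wrap=True, force_terminal=True)
#     console.print('[cyan]worker[/cyan] ' + 'x' * 500)  # colored output, emitted as one long un-wrapped line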
class RainbowHighlighter(Highlighter):
def highlight(self, text):

View File

@@ -603,21 +603,17 @@ def log_worker_event(
# Build final message
error_str = f' {type(error).__name__}: {error}' if error else ''
# Build colored message - worker_label needs to be inside color tags
# But first we need to format the color tags separately from the worker label
from archivebox.misc.logging import CONSOLE
from rich.text import Text
# Create a Rich Text object for proper formatting
text = Text()
text.append(indent)
# Append worker label and event with color
text.append(f'{worker_label} {event}{error_str}', style=color)
# Append metadata without color (add separator if metadata exists)
if metadata_str:
text.append(f' | {metadata_str}')
CONSOLE.print(text, soft_wrap=True)
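# Rough sketch of the resulting call pattern (illustrative values; the style color is an example):
#
#     text = Text()
#     text.append('    ')                                  # indentation
#     text.append('[DB] Created Crawl', style='cyan')      # worker label + event, colored
#     text.append(' | id=0194... first_url=https://...')   # metadata, uncolored
#     CONSOLE.print(text, soft_wrap=True)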
@enforce_types

View File

@@ -1,7 +1,5 @@
__package__ = 'archivebox'
import sys
import shutil
import django
import pydantic
@@ -20,14 +18,10 @@ timezone.utc = datetime.timezone.utc
# DjangoSignalWebhooksConfig.verbose_name = 'API'
# Rich traceback handler disabled - it adds frames/boxes that wrap weirdly in log files
# Standard Python tracebacks are used instead (full width, no frames)
# from rich.traceback import install
# install(show_locals=True, word_wrap=False, ...)
TERM_WIDTH = (shutil.get_terminal_size((200, 10)).columns - 1) if sys.stdout.isatty() else 200
# os.environ.setdefault('COLUMNS', str(TERM_WIDTH))
install(show_locals=True, word_wrap=False, locals_max_length=10, locals_hide_dunder=True, suppress=[django, pydantic], extra_lines=2, width=TERM_WIDTH)
# Hide site-packages/sonic/client.py:115: SyntaxWarning

View File

@@ -552,21 +552,21 @@
if (crawl.status === 'queued' && !crawl.can_start) {
warningHtml = `
<div style="padding: 8px 14px; background: rgba(248, 81, 73, 0.1); border-top: 1px solid #f85149; color: #f85149; font-size: 11px;">
⚠️ Crawl cannot start: ${crawl.urls_preview ? 'unknown error' : 'no URLs'}
</div>
`;
} else if (crawl.status === 'queued' && crawl.retry_at_future) {
// Queued but retry_at is in future (was claimed by worker, will retry)
warningHtml = `
<div style="padding: 8px 14px; background: rgba(88, 166, 255, 0.1); border-top: 1px solid #58a6ff; color: #58a6ff; font-size: 11px;">
🔄 Retrying in ${crawl.seconds_until_retry}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
</div>
`;
} else if (crawl.status === 'queued' && crawl.total_snapshots === 0) {
// Queued and waiting to be picked up by worker
warningHtml = `
<div style="padding: 8px 14px; background: rgba(210, 153, 34, 0.1); border-top: 1px solid #d29922; color: #d29922; font-size: 11px;">
⏳ Waiting for worker to pick up...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
</div>
`;
}
@@ -577,8 +577,8 @@
metaText += ` | ${crawl.total_snapshots} snapshots`;
} else if (crawl.urls_count > 0) {
metaText += ` | ${crawl.urls_count} URLs`;
} else if (crawl.urls_preview) {
metaText += ` | ${crawl.urls_preview.substring(0, 40)}${crawl.urls_preview.length > 40 ? '...' : ''}`;
}
return `

View File

@@ -26,6 +26,9 @@ CONFIG_FILE_NAME = "supervisord.conf"
PID_FILE_NAME = "supervisord.pid" PID_FILE_NAME = "supervisord.pid"
WORKERS_DIR_NAME = "workers" WORKERS_DIR_NAME = "workers"
# Global reference to supervisord process for cleanup
_supervisord_proc = None
ORCHESTRATOR_WORKER = {
"name": "worker_orchestrator",
"command": "archivebox manage orchestrator", # runs forever by default
@@ -78,7 +81,7 @@ def create_supervisord_config():
config_content = f""" config_content = f"""
[supervisord] [supervisord]
nodaemon = true nodaemon = true
environment = IS_SUPERVISORD_PARENT="true" environment = IS_SUPERVISORD_PARENT="true",COLUMNS="200"
pidfile = {PID_FILE} pidfile = {PID_FILE}
logfile = {LOG_FILE} logfile = {LOG_FILE}
childlogdir = {CONSTANTS.LOGS_DIR} childlogdir = {CONSTANTS.LOGS_DIR}
@@ -143,11 +146,27 @@ def get_existing_supervisord_process():
return None
def stop_existing_supervisord_process():
global _supervisord_proc
SOCK_FILE = get_sock_file()
PID_FILE = SOCK_FILE.parent / PID_FILE_NAME
try:
# First try to stop via the global proc reference
if _supervisord_proc and _supervisord_proc.poll() is None:
try:
print(f"[🦸‍♂️] Stopping supervisord process (pid={_supervisord_proc.pid})...")
_supervisord_proc.terminate()
try:
_supervisord_proc.wait(timeout=5)
except subprocess.TimeoutExpired:
_supervisord_proc.kill()
_supervisord_proc.wait(timeout=2)
except (BaseException, BrokenPipeError, IOError, KeyboardInterrupt):
pass
_supervisord_proc = None
return
# Fallback: if pid file exists, load PID int and kill that process
try:
pid = int(PID_FILE.read_text())
except (FileNotFoundError, ValueError):
@@ -156,8 +175,25 @@ def stop_existing_supervisord_process():
try:
print(f"[🦸‍♂️] Stopping supervisord process (pid={pid})...")
proc = psutil.Process(pid)
# Kill the entire process group to ensure all children are stopped
children = proc.children(recursive=True)
proc.terminate() proc.terminate()
# Also terminate all children
for child in children:
try:
child.terminate()
except psutil.NoSuchProcess:
pass
proc.wait(timeout=5)
# Kill any remaining children
for child in children:
try:
if child.is_running():
child.kill()
except psutil.NoSuchProcess:
pass
except psutil.NoSuchProcess:
pass
except (BaseException, BrokenPipeError, IOError, KeyboardInterrupt):
pass
finally:
@@ -192,40 +228,44 @@ def start_new_supervisord_process(daemonize=False):
# create the supervisord config file
create_supervisord_config()
# Open log file for supervisord output
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
log_handle = open(LOG_FILE, 'a')
if daemonize:
# Start supervisord in background (daemon mode)
subprocess.Popen(
f"supervisord --configuration={CONFIG_FILE}",
stdin=None,
stdout=log_handle,
stderr=log_handle,
shell=True,
start_new_session=True,
)
time.sleep(2)
return get_existing_supervisord_process()
else:
# Start supervisord in FOREGROUND - this will block until supervisord exits
# supervisord with nodaemon=true will run in foreground and handle signals properly
# When supervisord gets SIGINT/SIGTERM, it will stop all child processes before exiting
proc = subprocess.Popen(
f"supervisord --configuration={CONFIG_FILE}",
stdin=None,
stdout=log_handle,
stderr=log_handle,
shell=True,
start_new_session=False, # Keep in same process group so signals propagate
)
# Store the process so we can wait on it later
global _supervisord_proc
_supervisord_proc = proc
elif signum != 13:
STDERR.print(f"\n[🦸‍♂️] Supervisord got stop signal ({signal.strsignal(signum)}). Terminating child processes...")
stop_existing_supervisord_process()
raise SystemExit(0)
# Wait a bit for supervisord to start up
time.sleep(2)
try:
signal.signal(signal.SIGINT, exit_signal_handler)
signal.signal(signal.SIGHUP, exit_signal_handler)
signal.signal(signal.SIGPIPE, exit_signal_handler)
signal.signal(signal.SIGTERM, exit_signal_handler)
except Exception:
# signal handlers only work in main thread
pass
# otherwise supervisord will containue in background even if parent proc is ends (aka daemon mode)
return get_existing_supervisord_process()
return get_existing_supervisord_process()
def get_or_create_supervisord_process(daemonize=False):
SOCK_FILE = get_sock_file()
@@ -353,9 +393,15 @@ def tail_worker_logs(log_path: str):
pass
def tail_multiple_worker_logs(log_files: list[str], follow=True, proc=None):
"""Tail multiple log files simultaneously, interleaving their output.
import select
Args:
log_files: List of log file paths to tail
follow: Whether to keep following (True) or just read existing content (False)
proc: Optional subprocess.Popen object - stop tailing when this process exits
"""
import re
from pathlib import Path
# Convert relative paths to absolute paths
@@ -377,48 +423,53 @@ def tail_multiple_worker_logs(log_files: list[str], follow=True):
for log_path in log_paths:
try:
f = open(log_path, 'r')
# Don't seek to end - show recent content so user sees something
# Go to end minus 4KB to show some recent logs
f.seek(0, 2) # Go to end first
file_size = f.tell()
if file_size > 4096:
f.seek(file_size - 4096)
f.readline() # Skip partial line
else:
f.seek(0) # Small file, read from start
file_handles.append((log_path, f))
print(f" [tailing {log_path.name}]")
except Exception as e:
sys.stderr.write(f"Warning: Could not open {log_path}: {e}\n")
if not file_handles:
sys.stderr.write("No log files could be opened\n")
return
# Print which logs we're tailing
log_names = [name for name, _ in file_handles]
print(f"[dim]Tailing: {', '.join(log_names)}[/dim]")
print()
try:
while follow:
# Check if the monitored process has exited
if proc is not None and proc.poll() is not None:
print(f"\n[server process exited with code {proc.returncode}]")
break
# Colorize based on log source
if 'orchestrator' in log_name.lower():
color = 'cyan'
elif 'daphne' in log_name.lower():
color = 'green'
else:
color = 'white'
had_output = False
# Read ALL available lines from all files (not just one per iteration)
for log_path, f in file_handles:
while True:
line = f.readline()
if not line:
break # No more lines available in this file
had_output = True
# Strip ANSI codes if present (supervisord does this but just in case)
import re
line_clean = re.sub(r'\x1b\[[0-9;]*m', '', line.rstrip())
if line_clean:
print(line_clean)
# Small sleep to avoid busy-waiting (only when no output)
if not had_output:
time.sleep(0.05)
except (KeyboardInterrupt, BrokenPipeError, IOError):
pass # Let the caller handle the cleanup message
except SystemExit:
pass
finally: finally:
@@ -451,6 +502,8 @@ def watch_worker(supervisor, daemon_name, interval=5):
def start_server_workers(host='0.0.0.0', port='8000', daemonize=False): def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
global _supervisord_proc
supervisor = get_or_create_supervisord_process(daemonize=daemonize)
bg_workers = [
@@ -466,36 +519,50 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
if not daemonize:
try:
# Tail worker logs while supervisord runs
sys.stdout.write('Tailing worker logs (Ctrl+C to stop)...\n\n')
sys.stdout.flush()
tail_multiple_worker_logs(
log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'],
follow=True,
proc=_supervisord_proc, # Stop tailing when supervisord exits
)
except (KeyboardInterrupt, BrokenPipeError, IOError):
STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
except SystemExit:
pass
except BaseException as e:
STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping gracefully...")
raise
finally:
# Ensure supervisord and all children are stopped
stop_existing_supervisord_process()
time.sleep(0.5)
def start_cli_workers(watch=False):
global _supervisord_proc
supervisor = get_or_create_supervisord_process(daemonize=False)
start_worker(supervisor, ORCHESTRATOR_WORKER)
if watch:
try:
# Block on supervisord process - it will handle signals and stop children
if _supervisord_proc:
_supervisord_proc.wait()
else:
# Fallback to watching worker if no proc reference
watch_worker(supervisor, ORCHESTRATOR_WORKER['name'])
except (KeyboardInterrupt, BrokenPipeError, IOError):
STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
except SystemExit:
pass
except BaseException as e:
STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping gracefully...")
raise
finally:
# Ensure supervisord and all children are stopped
stop_existing_supervisord_process()
time.sleep(0.5)
return [ORCHESTRATOR_WORKER]
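For reference, the terminate-then-kill pattern used by stop_existing_supervisord_process() above can be written as a small standalone helper — a minimal sketch assuming only psutil, not part of this commit:

import psutil

def terminate_process_tree(pid: int, timeout: float = 5.0) -> None:
    """Terminate a process and all of its children, escalating to kill if they linger."""
    try:
        parent = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return
    procs = [parent, *parent.children(recursive=True)]
    for proc in procs:
        try:
            proc.terminate()
        except psutil.NoSuchProcess:
            pass
    # wait_procs returns (gone, alive); anything still alive gets SIGKILL
    _, alive = psutil.wait_procs(procs, timeout=timeout)
    for proc in alive:
        try:
            proc.kill()
        except psutil.NoSuchProcess:
            pass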