Remove the Seed model in favor of using Crawl as the template

This commit is contained in:
Nick Sweeting
2025-12-25 01:52:38 -08:00
parent 28e6c5bb65
commit bb53228ebf
30 changed files with 785 additions and 690 deletions

View File

@@ -27,10 +27,9 @@ TYPE_SNAPSHOT = 'Snapshot'
TYPE_ARCHIVERESULT = 'ArchiveResult'
TYPE_TAG = 'Tag'
TYPE_CRAWL = 'Crawl'
TYPE_SEED = 'Seed'
TYPE_INSTALLEDBINARY = 'InstalledBinary'
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_SEED, TYPE_INSTALLEDBINARY}
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_INSTALLEDBINARY}
def parse_line(line: str) -> Optional[Dict[str, Any]]:
@@ -206,7 +205,8 @@ def crawl_to_jsonl(crawl) -> Dict[str, Any]:
return {
'type': TYPE_CRAWL,
'id': str(crawl.id),
'seed_id': str(crawl.seed_id),
'urls': crawl.urls,
'extractor': crawl.extractor,
'status': crawl.status,
'max_depth': crawl.max_depth,
'created_at': crawl.created_at.isoformat() if crawl.created_at else None,

View File

@@ -13,9 +13,11 @@ from rich.console import Console
from rich.highlighter import Highlighter
# SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS
CONSOLE = Console()
STDERR = Console(stderr=True)
IS_TTY = CONSOLE.is_interactive
# Disable hard wrapping: soft_wrap=True plus a very large width lets text flow naturally.
# Colors are preserved (force_terminal=True); only the inserted hard line breaks are dropped.
CONSOLE = Console(width=32768, soft_wrap=True, force_terminal=True)
STDERR = Console(stderr=True, width=32768, soft_wrap=True, force_terminal=True)
IS_TTY = sys.stdout.isatty()
class RainbowHighlighter(Highlighter):
def highlight(self, text):

View File

@@ -603,21 +603,17 @@ def log_worker_event(
# Build final message
error_str = f' {type(error).__name__}: {error}' if error else ''
# Build colored message - worker_label needs to be inside color tags
# But first we need to format the color tags separately from the worker label
from archivebox.misc.logging import CONSOLE
from rich.text import Text
# Create a Rich Text object for proper formatting
text = Text()
text.append(indent) # Indentation
# Append worker label and event with color
text.append(indent)
text.append(f'{worker_label} {event}{error_str}', style=color)
# Append metadata without color (add separator if metadata exists)
if metadata_str:
text.append(f' | {metadata_str}')
CONSOLE.print(text)
CONSOLE.print(text, soft_wrap=True)
@enforce_types

View File

@@ -1,7 +1,5 @@
__package__ = 'archivebox'
import sys
import shutil
import django
import pydantic
@@ -20,14 +18,10 @@ timezone.utc = datetime.timezone.utc
# DjangoSignalWebhooksConfig.verbose_name = 'API'
# Install rich for pretty tracebacks in console logs
# https://rich.readthedocs.io/en/stable/traceback.html#traceback-handler
from rich.traceback import install # noqa
TERM_WIDTH = (shutil.get_terminal_size((200, 10)).columns - 1) if sys.stdout.isatty() else 200
# os.environ.setdefault('COLUMNS', str(TERM_WIDTH))
install(show_locals=True, word_wrap=False, locals_max_length=10, locals_hide_dunder=True, suppress=[django, pydantic], extra_lines=2, width=TERM_WIDTH)
# Rich traceback handler disabled - it adds frames/boxes that wrap weirdly in log files
# Standard Python tracebacks are used instead (full width, no frames)
# from rich.traceback import install
# install(show_locals=True, word_wrap=False, ...)
# Hide site-packages/sonic/client.py:115: SyntaxWarning