tons of fixes with codex

2026-04-06 07:47:53 +10:00 · 2026-01-19 01:00:53 -08:00
parent eaf7256345
commit c7b2217cd6
184 changed files with 3943 additions and 2420 deletions
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -344,6 +344,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
    @property
    def process_set(self):
        """Get all Process objects related to this snapshot's ArchiveResults."""
+        import json
+        import json
        from archivebox.machine.models import Process
        return Process.objects.filter(archiveresult__snapshot_id=self.id)

@@ -613,7 +615,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea

        ONLY used by: archivebox update (for orphan detection)
        """
-        import json
+        from archivebox.machine.models import Process

        # Try index.jsonl first (new format), then index.json (legacy)
        jsonl_path = snapshot_dir / CONSTANTS.JSONL_INDEX_FILENAME
@@ -622,15 +624,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
        data = None
        if jsonl_path.exists():
            try:
-                with open(jsonl_path) as f:
-                    for line in f:
-                        line = line.strip()
-                        if line.startswith('{'):
-                            record = json.loads(line)
-                            if record.get('type') == 'Snapshot':
-                                data = record
-                                break
-            except (json.JSONDecodeError, OSError):
+                records = Process.parse_records_from_text(jsonl_path.read_text())
+                for record in records:
+                    if record.get('type') == 'Snapshot':
+                        data = record
+                        break
+            except OSError:
                pass
        elif json_path.exists():
            try:
@@ -689,7 +688,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea

        ONLY used by: archivebox update (for orphan import)
        """
-        import json
+        from archivebox.machine.models import Process

        # Try index.jsonl first (new format), then index.json (legacy)
        jsonl_path = snapshot_dir / CONSTANTS.JSONL_INDEX_FILENAME
@@ -698,15 +697,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
        data = None
        if jsonl_path.exists():
            try:
-                with open(jsonl_path) as f:
-                    for line in f:
-                        line = line.strip()
-                        if line.startswith('{'):
-                            record = json.loads(line)
-                            if record.get('type') == 'Snapshot':
-                                data = record
-                                break
-            except (json.JSONDecodeError, OSError):
+                records = Process.parse_records_from_text(jsonl_path.read_text())
+                for record in records:
+                    if record.get('type') == 'Snapshot':
+                        data = record
+                        break
+            except OSError:
                pass
        elif json_path.exists():
            try:
@@ -1040,7 +1036,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea

        Returns dict with keys: 'snapshot', 'archive_results', 'binaries', 'processes'
        """
-        import json
+        from archivebox.machine.models import Process
        from archivebox.misc.jsonl import (
            TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_BINARY, TYPE_PROCESS,
        )
@@ -1056,24 +1052,17 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
        if not index_path.exists():
            return result

-        with open(index_path, 'r') as f:
-            for line in f:
-                line = line.strip()
-                if not line or not line.startswith('{'):
-                    continue
-                try:
-                    record = json.loads(line)
-                    record_type = record.get('type')
-                    if record_type == TYPE_SNAPSHOT:
-                        result['snapshot'] = record
-                    elif record_type == TYPE_ARCHIVERESULT:
-                        result['archive_results'].append(record)
-                    elif record_type == TYPE_BINARY:
-                        result['binaries'].append(record)
-                    elif record_type == TYPE_PROCESS:
-                        result['processes'].append(record)
-                except json.JSONDecodeError:
-                    continue
+        records = Process.parse_records_from_text(index_path.read_text())
+        for record in records:
+            record_type = record.get('type')
+            if record_type == TYPE_SNAPSHOT:
+                result['snapshot'] = record
+            elif record_type == TYPE_ARCHIVERESULT:
+                result['archive_results'].append(record)
+            elif record_type == TYPE_BINARY:
+                result['binaries'].append(record)
+            elif record_type == TYPE_PROCESS:
+                result['processes'].append(record)

        return result

@@ -1317,7 +1306,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
            for plugin in all_plugins:
                result = archive_results.get(plugin)
                existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
-                icon = get_plugin_icon(plugin)
+                icon = mark_safe(get_plugin_icon(plugin))

                # Skip plugins with empty icons that have no output
                # (e.g., staticfile only shows when there's actual output)
@@ -1373,6 +1362,45 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea

        return str(current_path)

+    def ensure_crawl_symlink(self) -> None:
+        """Ensure snapshot is symlinked under its crawl output directory."""
+        import os
+        from pathlib import Path
+        from django.utils import timezone
+        from archivebox import DATA_DIR
+        from archivebox.crawls.models import Crawl
+
+        if not self.crawl_id:
+            return
+        crawl = Crawl.objects.filter(id=self.crawl_id).select_related('created_by').first()
+        if not crawl:
+            return
+
+        date_base = crawl.created_at or self.created_at or timezone.now()
+        date_str = date_base.strftime('%Y%m%d')
+        domain = self.extract_domain_from_url(self.url)
+        username = crawl.created_by.username if crawl.created_by_id else 'system'
+
+        crawl_dir = DATA_DIR / 'users' / username / 'crawls' / date_str / domain / str(crawl.id)
+        link_path = crawl_dir / 'snapshots' / domain / str(self.id)
+        link_parent = link_path.parent
+        link_parent.mkdir(parents=True, exist_ok=True)
+
+        target = Path(self.output_dir)
+        if link_path.exists() or link_path.is_symlink():
+            if link_path.is_symlink():
+                if link_path.resolve() == target.resolve():
+                    return
+                link_path.unlink(missing_ok=True)
+            else:
+                return
+
+        rel_target = os.path.relpath(target, link_parent)
+        try:
+            link_path.symlink_to(rel_target, target_is_directory=True)
+        except OSError:
+            return
+
    @cached_property
    def archive_path(self):
        return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}'
@@ -1636,6 +1664,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
        if update_fields:
            snapshot.save(update_fields=update_fields + ['modified_at'])

+        snapshot.ensure_crawl_symlink()
+
        return snapshot

    def create_pending_archiveresults(self) -> list['ArchiveResult']:
@@ -1689,7 +1719,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
        """
        # Check if any ARs are still pending/started
        pending = self.archiveresult_set.exclude(
-            status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES
+            status__in=ArchiveResult.FINAL_STATES
        ).exists()

        return not pending
@@ -1754,7 +1784,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
        - Plugins run in order (numeric prefix)
        - Each plugin checks its dependencies at runtime

-        Dependency handling (e.g., chrome_session → screenshot):
+        Dependency handling (e.g., chrome → screenshot):
        - Plugins check if required outputs exist before running
        - If dependency output missing → plugin returns 'skipped'
        - On retry, if dependency now succeeds → dependent can run
@@ -2117,6 +2147,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
        TITLE_LOADING_MSG = 'Not yet archived...'

        canonical = self.canonical_outputs()
+        preview_priority = [
+            'singlefile_path',
+            'screenshot_path',
+            'wget_path',
+            'dom_path',
+            'pdf_path',
+            'readability_path',
+        ]
+        best_preview_path = next(
+            (canonical.get(key) for key in preview_priority if canonical.get(key)),
+            canonical.get('index_path', 'index.html'),
+        )
        context = {
            **self.to_dict(extended=True),
            **{f'{k}_path': v for k, v in canonical.items()},
@@ -2132,6 +2174,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
            'oldest_archive_date': ts_to_date_str(self.oldest_archive_date),
            'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
            'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
+            'best_preview_path': best_preview_path,
        }
        rendered_html = render_to_string('snapshot.html', context)
        atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
@@ -2669,12 +2712,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
        - end_ts, retry_at, cmd, cmd_version, binary FK
        - Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records()
        """
-        import json
        import mimetypes
        from collections import defaultdict
        from pathlib import Path
        from django.utils import timezone
-        from archivebox.hooks import process_hook_records
+        from archivebox.hooks import process_hook_records, extract_records_from_process
+        from archivebox.machine.models import Process

        plugin_dir = Path(self.pwd) if self.pwd else None
        if not plugin_dir or not plugin_dir.exists():
@@ -2687,15 +2730,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi

        # Read and parse JSONL output from stdout.log
        stdout_file = plugin_dir / 'stdout.log'
-        stdout = stdout_file.read_text() if stdout_file.exists() else ''
-
        records = []
-        for line in stdout.splitlines():
-            if line.strip() and line.strip().startswith('{'):
-                try:
-                    records.append(json.loads(line))
-                except json.JSONDecodeError:
-                    continue
+        if self.process_id and self.process:
+            records = extract_records_from_process(self.process)
+
+        if not records:
+            stdout = stdout_file.read_text() if stdout_file.exists() else ''
+            records = Process.parse_records_from_text(stdout)

        # Find ArchiveResult record and update status/output from it
        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
@@ -2722,9 +2763,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
                self._set_binary_from_cmd(hook_data['cmd'])
            # Note: cmd_version is derived from binary.version, not stored on Process
        else:
-            # No ArchiveResult record = failed
-            self.status = self.StatusChoices.FAILED
-            self.output_str = 'Hook did not output ArchiveResult record'
+            # No ArchiveResult record: treat background hooks or clean exits as skipped
+            is_background = False
+            try:
+                from archivebox.hooks import is_background_hook
+                is_background = bool(self.hook_name and is_background_hook(self.hook_name))
+            except Exception:
+                pass
+
+            if is_background or (self.process_id and self.process and self.process.exit_code == 0):
+                self.status = self.StatusChoices.SKIPPED
+                self.output_str = 'Hook did not output ArchiveResult record'
+            else:
+                self.status = self.StatusChoices.FAILED
+                self.output_str = 'Hook did not output ArchiveResult record'

        # Walk filesystem and populate output_files, output_size, output_mimetypes
        exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
@@ -2793,14 +2845,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
        }
        process_hook_records(filtered_records, overrides=overrides)

-        # Cleanup PID files and empty logs
+        # Cleanup PID files (keep logs even if empty so they can be tailed)
        pid_file = plugin_dir / 'hook.pid'
        pid_file.unlink(missing_ok=True)
-        stderr_file = plugin_dir / 'stderr.log'
-        if stdout_file.exists() and stdout_file.stat().st_size == 0:
-            stdout_file.unlink()
-        if stderr_file.exists() and stderr_file.stat().st_size == 0:
-            stderr_file.unlink()

    def _set_binary_from_cmd(self, cmd: list) -> None:
        """
@@ -3186,4 +3233,4 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
 # Manually register state machines with python-statemachine registry
 # (normally auto-discovered from statemachines.py, but we define them here for clarity)
 registry.register(SnapshotMachine)
-registry.register(ArchiveResultMachine)
+registry.register(ArchiveResultMachine)
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -436,6 +436,10 @@ SIGNAL_WEBHOOKS = {
    },
 }

+# Avoid background threads touching sqlite connections (especially during tests/migrations).
+if DATABASES["default"]["ENGINE"].endswith("sqlite3"):
+    SIGNAL_WEBHOOKS["TASK_HANDLER"] = "signal_webhooks.handlers.sync_task_handler"
+
 ################################################################################
 ### Admin Data View Settings
 ################################################################################
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -120,7 +120,15 @@ class SnapshotView(View):
        # Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
        # Convert to base names for display ordering
        all_plugins = [get_plugin_name(e) for e in get_enabled_plugins()]
-        preferred_types = tuple(all_plugins)
+        preview_priority = [
+            'singlefile',
+            'screenshot',
+            'wget',
+            'dom',
+            'pdf',
+            'readability',
+        ]
+        preferred_types = tuple(preview_priority + [p for p in all_plugins if p not in preview_priority])
        all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)

        best_result = {'path': 'None', 'result': None}