new gallerydl plugin and more

2026-04-05 07:17:52 +10:00 · 2025-12-26 11:55:03 -08:00
parent 9838d7ba02
commit 4fd7fcdbcf
20 changed files with 3495 additions and 1435 deletions
--- a/archivebox/core/admin_archiveresults.py
+++ b/archivebox/core/admin_archiveresults.py
@@ -66,6 +66,13 @@ def render_archiveresults_list(archiveresults_qs, limit=50):

        rows.append(f'''
            <tr style="border-bottom: 1px solid #f1f5f9; transition: background 0.15s;" onmouseover="this.style.background='#f8fafc'" onmouseout="this.style.background='transparent'">
+                <td style="padding: 10px 12px; white-space: nowrap;">
+                    <a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
+                       style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 11px;"
+                       title="View/edit archive result">
+                        <code>{str(result.id)[:8]}</code>
+                    </a>
+                </td>
                <td style="padding: 10px 12px; white-space: nowrap;">
                    <span style="display: inline-block; padding: 3px 10px; border-radius: 12px;
                                 font-size: 11px; font-weight: 600; text-transform: uppercase;
@@ -75,7 +82,13 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
                    {icon}
                </td>
                <td style="padding: 10px 12px; font-weight: 500; color: #334155;">
-                    {result.extractor}
+                    <a href="{output_link}" target="_blank"
+                       style="color: #334155; text-decoration: none;"
+                       title="View output fullscreen"
+                       onmouseover="this.style.color='#2563eb'; this.style.textDecoration='underline';"
+                       onmouseout="this.style.color='#334155'; this.style.textDecoration='none';">
+                        {result.extractor}
+                    </a>
                </td>
                <td style="padding: 10px 12px; max-width: 280px;">
                    <span onclick="document.getElementById('{row_id}').open = !document.getElementById('{row_id}').open"
@@ -102,14 +115,14 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
                </td>
            </tr>
            <tr style="border-bottom: 1px solid #e2e8f0;">
-                <td colspan="7" style="padding: 0 12px 10px 12px;">
+                <td colspan="8" style="padding: 0 12px 10px 12px;">
                    <details id="{row_id}" style="margin: 0;">
                        <summary style="cursor: pointer; font-size: 11px; color: #94a3b8; user-select: none;">
                            Details &amp; Output
                        </summary>
                        <div style="margin-top: 8px; padding: 10px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; max-height: 200px; overflow: auto;">
                            <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
-                                <span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)[:8]}...</code></span>
+                                <span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)}</code></span>
                                <span style="margin-right: 16px;"><b>Version:</b> <code>{version}</code></span>
                                <span style="margin-right: 16px;"><b>PWD:</b> <code>{result.pwd or '-'}</code></span>
                            </div>
@@ -132,7 +145,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
    if total_count > limit:
        footer = f'''
            <tr>
-                <td colspan="7" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
+                <td colspan="8" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
                    Showing {limit} of {total_count} results &nbsp;
                    <a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ''}"
                       style="color: #2563eb;">View all →</a>
@@ -145,6 +158,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
            <table style="width: 100%; border-collapse: collapse; font-size: 14px;">
                <thead>
                    <tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;">
+                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">ID</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Extractor</th>
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -635,40 +635,143 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
    # =========================================================================

    def canonical_outputs(self) -> Dict[str, Optional[str]]:
-        """Predict the expected output paths that should be present after archiving"""
+        """
+        Discover the best output file for each succeeded ArchiveResult, keyed as '<extractor>_path'.
+        Prefers the recorded output path if it exists on disk, else scans the extractor's directory (ranked by priority, then size).
+        """
        FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'
+
+        # File extensions (not mimetypes) that can be embedded/previewed in an iframe
+        IFRAME_EMBEDDABLE_EXTENSIONS = {
+            'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl',
+            'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico',
+            'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav',
+        }
+
+        MIN_DISPLAY_SIZE = 15_000  # 15KB - filter out tiny files
+        MAX_SCAN_FILES = 50  # Stop scanning after this many entries (dirs and skipped files count too)
+
+        def find_best_output_in_dir(dir_path: Path, extractor_name: str) -> Optional[str]:
+            """Find the best representative file in an extractor's output directory"""
+            if not dir_path.exists() or not dir_path.is_dir():
+                return None
+
+            candidates = []
+            file_count = 0
+
+            # Special handling for media extractor - look for thumbnails
+            is_media_dir = extractor_name == 'media'
+
+            # Scan for suitable files
+            for file_path in dir_path.rglob('*'):
+                file_count += 1
+                if file_count > MAX_SCAN_FILES:
+                    break
+
+                if file_path.is_dir() or file_path.name.startswith('.'):
+                    continue
+
+                ext = file_path.suffix.lstrip('.').lower()
+                if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
+                    continue
+
+                try:
+                    size = file_path.stat().st_size
+                except OSError:
+                    continue
+
+                # For media dir, allow smaller image files (thumbnails are often < 15KB)
+                min_size = 5_000 if (is_media_dir and ext in ('png', 'jpg', 'jpeg', 'webp', 'gif')) else MIN_DISPLAY_SIZE
+                if size < min_size:
+                    continue
+
+                # Prefer main files: index.html, output.*, content.*, etc.
+                priority = 0
+                name_lower = file_path.name.lower()
+
+                if is_media_dir:
+                    # Special prioritization for media directories
+                    if any(keyword in name_lower for keyword in ('thumb', 'thumbnail', 'cover', 'poster')):
+                        priority = 200  # Highest priority for thumbnails
+                    elif ext in ('png', 'jpg', 'jpeg', 'webp', 'gif'):
+                        priority = 150  # High priority for any image
+                    elif ext in ('mp4', 'webm', 'mp3', 'opus', 'ogg'):
+                        priority = 100  # Lower priority for actual media files
+                    else:
+                        priority = 50
+                elif 'index' in name_lower:
+                    priority = 100
+                elif name_lower.startswith(('output', 'content', extractor_name)):
+                    priority = 50
+                elif ext in ('html', 'htm', 'pdf'):
+                    priority = 30
+                elif ext in ('png', 'jpg', 'jpeg', 'webp'):
+                    priority = 20
+                else:
+                    priority = 10
+
+                candidates.append((priority, size, file_path))
+
+            if not candidates:
+                return None
+
+            # Sort by priority (desc), then size (desc)
+            candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
+            best_file = candidates[0][2]
+            return str(best_file.relative_to(Path(self.output_dir)))
+
        canonical = {
            'index_path': 'index.html',
-            'favicon_path': 'favicon.ico',
            'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
-            'wget_path': f'warc/{self.timestamp}',
-            'warc_path': 'warc/',
-            'singlefile_path': 'singlefile.html',
-            'readability_path': 'readability/content.html',
-            'mercury_path': 'mercury/content.html',
-            'htmltotext_path': 'htmltotext.txt',
-            'pdf_path': 'output.pdf',
-            'screenshot_path': 'screenshot.png',
-            'dom_path': 'output.html',
            'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
-            'git_path': 'git/',
-            'media_path': 'media/',
-            'headers_path': 'headers.json',
        }

+        # Scan each ArchiveResult's output directory for the best file
+        snap_dir = Path(self.output_dir)
+        for result in self.archiveresult_set.filter(status='succeeded'):
+            if not result.output:
+                continue
+
+            # Try to find the best output file for this extractor
+            extractor_dir = snap_dir / result.extractor
+            best_output = None
+
+            if result.output and (snap_dir / result.output).exists():
+                # Use the explicit output path if it exists
+                best_output = result.output
+            elif extractor_dir.exists():
+                # Intelligently find the best file in the extractor's directory
+                best_output = find_best_output_in_dir(extractor_dir, result.extractor)
+
+            if best_output:
+                canonical[f'{result.extractor}_path'] = best_output
+
+        # Also scan top-level for legacy outputs (backwards compatibility)
+        for file_path in snap_dir.glob('*'):
+            if file_path.is_dir() or file_path.name in ('index.html', 'index.json'):
+                continue
+
+            ext = file_path.suffix.lstrip('.').lower()
+            if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
+                continue
+
+            try:
+                size = file_path.stat().st_size
+                if size >= MIN_DISPLAY_SIZE:
+                    # Add as generic output with stem as key
+                    key = f'{file_path.stem}_path'
+                    if key not in canonical:
+                        canonical[key] = file_path.name
+            except OSError:
+                continue
+
        if self.is_static:
            static_path = f'warc/{self.timestamp}'
            canonical.update({
                'title': self.basename,
                'wget_path': static_path,
-                'pdf_path': static_path,
-                'screenshot_path': static_path,
-                'dom_path': static_path,
-                'singlefile_path': static_path,
-                'readability_path': static_path,
-                'mercury_path': static_path,
-                'htmltotext_path': static_path,
            })
+
        return canonical

    def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -86,54 +86,37 @@ class SnapshotView(View):
                }
                archiveresults[result.extractor] = result_info

-        existing_files = {result['path'] for result in archiveresults.values()}
-        min_size_threshold = 10_000  # bytes
-        allowed_extensions = {
-            'txt',
-            'html',
-            'htm',
-            'png',
-            'jpg',
-            'jpeg',
-            'gif',
-            'webp'
-            'svg',
-            'webm',
-            'mp4',
-            'mp3',
-            'opus',
-            'pdf',
-            'md',
-        }
+        # Use canonical_outputs for intelligent discovery
+        # This method now scans ArchiveResults and uses smart heuristics
+        canonical = snapshot.canonical_outputs()

-
-        # iterate through all the files in the snapshot dir and add the biggest ones to the result list
+        # Add any newly discovered outputs from canonical_outputs to archiveresults
        snap_dir = Path(snapshot.output_dir)
-        if not os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK):
-            return {}
-
-        for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
-            extension = result_file.suffix.lstrip('.').lower()
-            if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions:
-                continue
-            if result_file.name in existing_files or result_file.name == 'index.html':
+        for key, path in canonical.items():
+            if not key.endswith('_path') or not path or path.startswith('http'):
+                continue
+
+            extractor_name = key.replace('_path', '')
+            if extractor_name in archiveresults:
+                continue  # Already have this from ArchiveResult
+
+            file_path = snap_dir / path
+            if not file_path.exists() or not file_path.is_file():
                continue

-            # Skip circular symlinks and other stat() failures
            try:
-                file_size = result_file.stat().st_size or 0
+                file_size = file_path.stat().st_size
+                if file_size >= 15_000:  # Only show files of at least 15KB (15,000 bytes)
+                    archiveresults[extractor_name] = {
+                        'name': extractor_name,
+                        'path': path,
+                        'ts': ts_to_date_str(file_path.stat().st_mtime or 0),
+                        'size': file_size,
+                        'result': None,
+                    }
            except OSError:
                continue

-            if file_size > min_size_threshold:
-                archiveresults[result_file.name] = {
-                    'name': result_file.stem,
-                    'path': result_file.relative_to(snap_dir),
-                    'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
-                    'size': file_size,
-                    'result': None,  # No ArchiveResult object for filesystem-discovered files
-                }
-
        # Get available extractors from hooks (sorted by numeric prefix for ordering)
        # Convert to base names for display ordering
        all_extractors = [get_extractor_name(e) for e in get_extractors()]