new gallerydl plugin and more

This commit is contained in:
Nick Sweeting
2025-12-26 11:55:03 -08:00
parent 9838d7ba02
commit 4fd7fcdbcf
20 changed files with 3495 additions and 1435 deletions

View File

@@ -66,6 +66,13 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
rows.append(f'''
<tr style="border-bottom: 1px solid #f1f5f9; transition: background 0.15s;" onmouseover="this.style.background='#f8fafc'" onmouseout="this.style.background='transparent'">
<td style="padding: 10px 12px; white-space: nowrap;">
<a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 11px;"
title="View/edit archive result">
<code>{str(result.id)[:8]}</code>
</a>
</td>
<td style="padding: 10px 12px; white-space: nowrap;">
<span style="display: inline-block; padding: 3px 10px; border-radius: 12px;
font-size: 11px; font-weight: 600; text-transform: uppercase;
@@ -75,7 +82,13 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
{icon}
</td>
<td style="padding: 10px 12px; font-weight: 500; color: #334155;">
{result.extractor}
<a href="{output_link}" target="_blank"
style="color: #334155; text-decoration: none;"
title="View output fullscreen"
onmouseover="this.style.color='#2563eb'; this.style.textDecoration='underline';"
onmouseout="this.style.color='#334155'; this.style.textDecoration='none';">
{result.extractor}
</a>
</td>
<td style="padding: 10px 12px; max-width: 280px;">
<span onclick="document.getElementById('{row_id}').open = !document.getElementById('{row_id}').open"
@@ -102,14 +115,14 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
</td>
</tr>
<tr style="border-bottom: 1px solid #e2e8f0;">
<td colspan="7" style="padding: 0 12px 10px 12px;">
<td colspan="8" style="padding: 0 12px 10px 12px;">
<details id="{row_id}" style="margin: 0;">
<summary style="cursor: pointer; font-size: 11px; color: #94a3b8; user-select: none;">
Details &amp; Output
</summary>
<div style="margin-top: 8px; padding: 10px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; max-height: 200px; overflow: auto;">
<div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
<span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)[:8]}...</code></span>
<span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)}</code></span>
<span style="margin-right: 16px;"><b>Version:</b> <code>{version}</code></span>
<span style="margin-right: 16px;"><b>PWD:</b> <code>{result.pwd or '-'}</code></span>
</div>
@@ -132,7 +145,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
if total_count > limit:
footer = f'''
<tr>
<td colspan="7" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
<td colspan="8" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
Showing {limit} of {total_count} results &nbsp;
<a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ''}"
style="color: #2563eb;">View all →</a>
@@ -145,6 +158,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
<table style="width: 100%; border-collapse: collapse; font-size: 14px;">
<thead>
<tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;">
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">ID</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Extractor</th>

View File

@@ -635,40 +635,143 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# =========================================================================
def canonical_outputs(self) -> Dict[str, Optional[str]]:
"""Predict the expected output paths that should be present after archiving"""
"""
Intelligently discover the best output file for each extractor.
Uses actual ArchiveResult data and filesystem scanning with smart heuristics.
"""
FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'
# Mimetypes that can be embedded/previewed in an iframe
IFRAME_EMBEDDABLE_EXTENSIONS = {
'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl',
'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico',
'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav',
}
MIN_DISPLAY_SIZE = 15_000 # 15KB - filter out tiny files
MAX_SCAN_FILES = 50 # Don't scan massive directories
def find_best_output_in_dir(dir_path: Path, extractor_name: str) -> Optional[str]:
    """
    Pick the most representative previewable file in an extractor's output directory.

    Walks ``dir_path`` recursively, examining at most MAX_SCAN_FILES directory
    entries, and keeps files whose extension is in IFRAME_EMBEDDABLE_EXTENSIONS
    and whose size meets the minimum threshold. Every surviving file is given a
    priority score; the highest priority wins and the larger file wins ties.

    Args:
        dir_path: Directory to scan; the result is expressed relative to
            ``self.output_dir``, so it must be located inside it.
        extractor_name: Name of the extractor that owns the directory. The
            value ``'media'`` switches to thumbnail-first scoring and a lower
            size threshold for images.

    Returns:
        The best file's path relative to ``self.output_dir`` as a string, or
        None when the directory is missing or no file qualifies.

    Raises:
        ValueError: If the chosen file is not located under ``self.output_dir``.
    """
    # is_dir() is already False for paths that do not exist
    if not dir_path.is_dir():
        return None

    media_image_exts = {'png', 'jpg', 'jpeg', 'webp', 'gif'}
    media_av_exts = {'mp4', 'webm', 'mp3', 'opus', 'ogg'}
    document_exts = {'html', 'htm', 'pdf'}
    generic_image_exts = {'png', 'jpg', 'jpeg', 'webp'}
    thumbnail_keywords = ('thumb', 'thumbnail', 'cover', 'poster')

    # Special handling for media extractor - look for thumbnails
    is_media_dir = extractor_name == 'media'

    # Filenames are compared lower-cased, so the extractor name must be too
    main_file_prefixes = ('output', 'content', extractor_name.lower())

    best = None  # (priority, size, path) of the best candidate seen so far

    for seen, file_path in enumerate(dir_path.rglob('*'), start=1):
        # Every entry counts toward the cap (dirs and skipped files included)
        # so that massive directories are never fully walked
        if seen > MAX_SCAN_FILES:
            break

        # Skip directories, dotfiles, and anything nested inside a dot-directory
        relative_parts = file_path.relative_to(dir_path).parts
        if file_path.is_dir() or any(part.startswith('.') for part in relative_parts):
            continue

        ext = file_path.suffix.lstrip('.').lower()
        if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
            continue

        try:
            size = file_path.stat().st_size
        except OSError:
            # Broken/circular symlinks and unreadable files are simply ignored
            continue

        # For media dir, allow smaller image files (thumbnails are often < 15KB)
        if is_media_dir and ext in media_image_exts:
            min_size = 5_000
        else:
            min_size = MIN_DISPLAY_SIZE
        if size < min_size:
            continue

        # Prefer main files: index.html, output.*, content.*, etc.
        name_lower = file_path.name.lower()
        if is_media_dir:
            if any(keyword in name_lower for keyword in thumbnail_keywords):
                priority = 200  # Highest priority for thumbnails
            elif ext in media_image_exts:
                priority = 150  # High priority for any image
            elif ext in media_av_exts:
                priority = 100  # Lower priority for actual media files
            else:
                priority = 50
        elif 'index' in name_lower:
            priority = 100
        elif name_lower.startswith(main_file_prefixes):
            priority = 50
        elif ext in document_exts:
            priority = 30
        elif ext in generic_image_exts:
            priority = 20
        else:
            priority = 10

        # Strict comparison keeps the first-seen file when priority and size tie
        if best is None or (priority, size) > (best[0], best[1]):
            best = (priority, size, file_path)

    if best is None:
        return None

    return str(best[2].relative_to(Path(self.output_dir)))
canonical = {
'index_path': 'index.html',
'favicon_path': 'favicon.ico',
'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
'wget_path': f'warc/{self.timestamp}',
'warc_path': 'warc/',
'singlefile_path': 'singlefile.html',
'readability_path': 'readability/content.html',
'mercury_path': 'mercury/content.html',
'htmltotext_path': 'htmltotext.txt',
'pdf_path': 'output.pdf',
'screenshot_path': 'screenshot.png',
'dom_path': 'output.html',
'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
'git_path': 'git/',
'media_path': 'media/',
'headers_path': 'headers.json',
}
# Scan each ArchiveResult's output directory for the best file
snap_dir = Path(self.output_dir)
for result in self.archiveresult_set.filter(status='succeeded'):
if not result.output:
continue
# Try to find the best output file for this extractor
extractor_dir = snap_dir / result.extractor
best_output = None
if result.output and (snap_dir / result.output).exists():
# Use the explicit output path if it exists
best_output = result.output
elif extractor_dir.exists():
# Intelligently find the best file in the extractor's directory
best_output = find_best_output_in_dir(extractor_dir, result.extractor)
if best_output:
canonical[f'{result.extractor}_path'] = best_output
# Also scan top-level for legacy outputs (backwards compatibility)
for file_path in snap_dir.glob('*'):
if file_path.is_dir() or file_path.name in ('index.html', 'index.json'):
continue
ext = file_path.suffix.lstrip('.').lower()
if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
continue
try:
size = file_path.stat().st_size
if size >= MIN_DISPLAY_SIZE:
# Add as generic output with stem as key
key = f'{file_path.stem}_path'
if key not in canonical:
canonical[key] = file_path.name
except OSError:
continue
if self.is_static:
static_path = f'warc/{self.timestamp}'
canonical.update({
'title': self.basename,
'wget_path': static_path,
'pdf_path': static_path,
'screenshot_path': static_path,
'dom_path': static_path,
'singlefile_path': static_path,
'readability_path': static_path,
'mercury_path': static_path,
'htmltotext_path': static_path,
})
return canonical
def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:

View File

@@ -86,54 +86,37 @@ class SnapshotView(View):
}
archiveresults[result.extractor] = result_info
existing_files = {result['path'] for result in archiveresults.values()}
min_size_threshold = 10_000 # bytes
allowed_extensions = {
'txt',
'html',
'htm',
'png',
'jpg',
'jpeg',
'gif',
'webp'
'svg',
'webm',
'mp4',
'mp3',
'opus',
'pdf',
'md',
}
# Use canonical_outputs for intelligent discovery
# This method now scans ArchiveResults and uses smart heuristics
canonical = snapshot.canonical_outputs()
# iterate through all the files in the snapshot dir and add the biggest ones to the result list
# Add any newly discovered outputs from canonical_outputs to archiveresults
snap_dir = Path(snapshot.output_dir)
if not os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK):
return {}
for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
extension = result_file.suffix.lstrip('.').lower()
if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions:
continue
if result_file.name in existing_files or result_file.name == 'index.html':
for key, path in canonical.items():
if not key.endswith('_path') or not path or path.startswith('http'):
continue
extractor_name = key.replace('_path', '')
if extractor_name in archiveresults:
continue # Already have this from ArchiveResult
file_path = snap_dir / path
if not file_path.exists() or not file_path.is_file():
continue
# Skip circular symlinks and other stat() failures
try:
file_size = result_file.stat().st_size or 0
file_size = file_path.stat().st_size
if file_size >= 15_000: # Only show files > 15KB
archiveresults[extractor_name] = {
'name': extractor_name,
'path': path,
'ts': ts_to_date_str(file_path.stat().st_mtime or 0),
'size': file_size,
'result': None,
}
except OSError:
continue
if file_size > min_size_threshold:
archiveresults[result_file.name] = {
'name': result_file.stem,
'path': result_file.relative_to(snap_dir),
'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
'size': file_size,
'result': None, # No ArchiveResult object for filesystem-discovered files
}
# Get available extractors from hooks (sorted by numeric prefix for ordering)
# Convert to base names for display ordering
all_extractors = [get_extractor_name(e) for e in get_extractors()]