mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 07:17:52 +10:00
new gallerydl plugin and more
This commit is contained in:
@@ -66,6 +66,13 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
|
||||
rows.append(f'''
|
||||
<tr style="border-bottom: 1px solid #f1f5f9; transition: background 0.15s;" onmouseover="this.style.background='#f8fafc'" onmouseout="this.style.background='transparent'">
|
||||
<td style="padding: 10px 12px; white-space: nowrap;">
|
||||
<a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
|
||||
style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 11px;"
|
||||
title="View/edit archive result">
|
||||
<code>{str(result.id)[:8]}</code>
|
||||
</a>
|
||||
</td>
|
||||
<td style="padding: 10px 12px; white-space: nowrap;">
|
||||
<span style="display: inline-block; padding: 3px 10px; border-radius: 12px;
|
||||
font-size: 11px; font-weight: 600; text-transform: uppercase;
|
||||
@@ -75,7 +82,13 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
{icon}
|
||||
</td>
|
||||
<td style="padding: 10px 12px; font-weight: 500; color: #334155;">
|
||||
{result.extractor}
|
||||
<a href="{output_link}" target="_blank"
|
||||
style="color: #334155; text-decoration: none;"
|
||||
title="View output fullscreen"
|
||||
onmouseover="this.style.color='#2563eb'; this.style.textDecoration='underline';"
|
||||
onmouseout="this.style.color='#334155'; this.style.textDecoration='none';">
|
||||
{result.extractor}
|
||||
</a>
|
||||
</td>
|
||||
<td style="padding: 10px 12px; max-width: 280px;">
|
||||
<span onclick="document.getElementById('{row_id}').open = !document.getElementById('{row_id}').open"
|
||||
@@ -102,14 +115,14 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
</td>
|
||||
</tr>
|
||||
<tr style="border-bottom: 1px solid #e2e8f0;">
|
||||
<td colspan="7" style="padding: 0 12px 10px 12px;">
|
||||
<td colspan="8" style="padding: 0 12px 10px 12px;">
|
||||
<details id="{row_id}" style="margin: 0;">
|
||||
<summary style="cursor: pointer; font-size: 11px; color: #94a3b8; user-select: none;">
|
||||
Details & Output
|
||||
</summary>
|
||||
<div style="margin-top: 8px; padding: 10px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; max-height: 200px; overflow: auto;">
|
||||
<div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
|
||||
<span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)[:8]}...</code></span>
|
||||
<span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)}</code></span>
|
||||
<span style="margin-right: 16px;"><b>Version:</b> <code>{version}</code></span>
|
||||
<span style="margin-right: 16px;"><b>PWD:</b> <code>{result.pwd or '-'}</code></span>
|
||||
</div>
|
||||
@@ -132,7 +145,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
if total_count > limit:
|
||||
footer = f'''
|
||||
<tr>
|
||||
<td colspan="7" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
|
||||
<td colspan="8" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
|
||||
Showing {limit} of {total_count} results
|
||||
<a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ''}"
|
||||
style="color: #2563eb;">View all →</a>
|
||||
@@ -145,6 +158,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
<table style="width: 100%; border-collapse: collapse; font-size: 14px;">
|
||||
<thead>
|
||||
<tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;">
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">ID</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Extractor</th>
|
||||
|
||||
@@ -635,40 +635,143 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
# =========================================================================
|
||||
|
||||
def canonical_outputs(self) -> Dict[str, Optional[str]]:
    """
    Map ``<extractor>_path`` keys to the best output to display for each.

    Starts from a table of default paths, then refines it in three passes:

    1. For every succeeded ArchiveResult with a non-empty ``output``, use
       that output if it exists under the snapshot directory; otherwise scan
       the ``<snapshot>/<extractor>/`` directory for the most representative
       embeddable file.
    2. Scan the top level of the snapshot directory for large embeddable
       files and add them under ``<stem>_path`` unless that key is taken.
    3. If the snapshot is static (``self.is_static``), point the page-like
       outputs at the ``warc/<timestamp>`` path and add a ``title`` entry.

    Returns:
        Dict of key -> path relative to the snapshot directory, or an
        absolute URL for the remote entries (favicon provider, archive.org).
    """
    FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'

    # Extensions that can be embedded/previewed in an iframe
    IFRAME_EMBEDDABLE_EXTENSIONS = {
        'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl',
        'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico',
        'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav',
    }
    MEDIA_IMAGE_EXTENSIONS = ('png', 'jpg', 'jpeg', 'webp', 'gif')
    MEDIA_AV_EXTENSIONS = ('mp4', 'webm', 'mp3', 'opus', 'ogg')

    MIN_DISPLAY_SIZE = 15_000       # 15KB - filter out tiny files
    MIN_MEDIA_IMAGE_SIZE = 5_000    # thumbnails are often smaller than 15KB
    MAX_SCAN_FILES = 50             # don't scan massive directories

    snap_dir = Path(self.output_dir)

    def output_exists(rel_output: str) -> bool:
        """Return True if rel_output names an existing path under snap_dir."""
        try:
            return (snap_dir / rel_output).exists()
        except (OSError, ValueError):
            # The stored output is not a usable path (e.g. far too long for
            # the filesystem, or contains a NUL byte): treat it as missing
            # instead of letting the whole lookup crash.
            return False

    def score_file(name_lower: str, ext: str, extractor_name: str, is_media_dir: bool) -> int:
        """Rank a candidate file; a higher score is a better representative."""
        if is_media_dir:
            if any(keyword in name_lower for keyword in ('thumb', 'thumbnail', 'cover', 'poster')):
                return 200      # thumbnails make the best preview
            if ext in MEDIA_IMAGE_EXTENSIONS:
                return 150      # any other image
            if ext in MEDIA_AV_EXTENSIONS:
                return 100      # the actual media files
            return 50

        if 'index' in name_lower:
            return 100
        # Drop empty prefixes (startswith('') is always True) and compare
        # lowercase against lowercase.
        main_prefixes = tuple(p for p in ('output', 'content', extractor_name.lower()) if p)
        if name_lower.startswith(main_prefixes):
            return 50
        if ext in ('html', 'htm', 'pdf'):
            return 30
        if ext in ('png', 'jpg', 'jpeg', 'webp'):
            return 20
        return 10

    def find_best_output_in_dir(dir_path: Path, extractor_name: str) -> Optional[str]:
        """Find the best representative file in an extractor's output directory."""
        if not dir_path.exists() or not dir_path.is_dir():
            return None

        # The media extractor gets special handling so thumbnails win
        is_media_dir = extractor_name == 'media'
        candidates = []

        for file_count, file_path in enumerate(dir_path.rglob('*'), start=1):
            # Every entry counts toward the cap, including ones skipped below
            if file_count > MAX_SCAN_FILES:
                break

            # Skip hidden files AND anything nested inside a hidden directory
            rel_parts = file_path.relative_to(dir_path).parts
            if any(part.startswith('.') for part in rel_parts) or file_path.is_dir():
                continue

            ext = file_path.suffix.lstrip('.').lower()
            if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
                continue

            try:
                size = file_path.stat().st_size
            except OSError:
                continue

            small_image_ok = is_media_dir and ext in MEDIA_IMAGE_EXTENSIONS
            min_size = MIN_MEDIA_IMAGE_SIZE if small_image_ok else MIN_DISPLAY_SIZE
            if size < min_size:
                continue

            priority = score_file(file_path.name.lower(), ext, extractor_name, is_media_dir)
            candidates.append((priority, size, file_path))

        if not candidates:
            return None

        # Highest priority first, then largest size; break remaining ties by
        # path so the result does not depend on filesystem iteration order.
        _, _, best_file = min(candidates, key=lambda c: (-c[0], -c[1], c[2].as_posix()))
        return str(best_file.relative_to(snap_dir))

    canonical = {
        'index_path': 'index.html',
        'favicon_path': 'favicon.ico',
        'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
        'wget_path': f'warc/{self.timestamp}',
        'warc_path': 'warc/',
        'singlefile_path': 'singlefile.html',
        'readability_path': 'readability/content.html',
        'mercury_path': 'mercury/content.html',
        'htmltotext_path': 'htmltotext.txt',
        'pdf_path': 'output.pdf',
        'screenshot_path': 'screenshot.png',
        'dom_path': 'output.html',
        'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
        'git_path': 'git/',
        'media_path': 'media/',
        'headers_path': 'headers.json',
    }

    # Refine the defaults using each succeeded ArchiveResult
    for result in self.archiveresult_set.filter(status='succeeded'):
        if not result.output:
            # Results without a recorded output keep their default entry
            continue

        extractor_dir = snap_dir / result.extractor
        best_output = None

        if output_exists(result.output):
            # Use the explicit output path if it exists
            best_output = result.output
        elif extractor_dir.exists():
            # Otherwise pick the best file in the extractor's directory
            best_output = find_best_output_in_dir(extractor_dir, result.extractor)

        if best_output:
            canonical[f'{result.extractor}_path'] = best_output

    # Also scan top-level for legacy outputs (backwards compatibility)
    for file_path in snap_dir.glob('*'):
        if file_path.is_dir() or file_path.name in ('index.html', 'index.json'):
            continue

        ext = file_path.suffix.lstrip('.').lower()
        if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
            continue

        try:
            size = file_path.stat().st_size
        except OSError:
            continue

        if size >= MIN_DISPLAY_SIZE:
            # Add as generic output with stem as key, never overriding an
            # entry that is already present
            canonical.setdefault(f'{file_path.stem}_path', file_path.name)

    if self.is_static:
        static_path = f'warc/{self.timestamp}'
        canonical.update({
            'title': self.basename,
            'wget_path': static_path,
            'pdf_path': static_path,
            'screenshot_path': static_path,
            'dom_path': static_path,
            'singlefile_path': static_path,
            'readability_path': static_path,
            'mercury_path': static_path,
            'htmltotext_path': static_path,
        })

    return canonical
|
||||
|
||||
def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
|
||||
|
||||
@@ -86,54 +86,37 @@ class SnapshotView(View):
|
||||
}
|
||||
archiveresults[result.extractor] = result_info
|
||||
|
||||
existing_files = {result['path'] for result in archiveresults.values()}
|
||||
min_size_threshold = 10_000 # bytes
|
||||
allowed_extensions = {
|
||||
'txt',
|
||||
'html',
|
||||
'htm',
|
||||
'png',
|
||||
'jpg',
|
||||
'jpeg',
|
||||
'gif',
|
||||
'webp'
|
||||
'svg',
|
||||
'webm',
|
||||
'mp4',
|
||||
'mp3',
|
||||
'opus',
|
||||
'pdf',
|
||||
'md',
|
||||
}
|
||||
# Use canonical_outputs for intelligent discovery
|
||||
# This method now scans ArchiveResults and uses smart heuristics
|
||||
canonical = snapshot.canonical_outputs()
|
||||
|
||||
|
||||
# iterate through all the files in the snapshot dir and add the biggest ones to the result list
|
||||
# Add any newly discovered outputs from canonical_outputs to archiveresults
|
||||
snap_dir = Path(snapshot.output_dir)
|
||||
if not os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK):
|
||||
return {}
|
||||
|
||||
for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
|
||||
extension = result_file.suffix.lstrip('.').lower()
|
||||
if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions:
|
||||
continue
|
||||
if result_file.name in existing_files or result_file.name == 'index.html':
|
||||
for key, path in canonical.items():
|
||||
if not key.endswith('_path') or not path or path.startswith('http'):
|
||||
continue
|
||||
|
||||
extractor_name = key.replace('_path', '')
|
||||
if extractor_name in archiveresults:
|
||||
continue # Already have this from ArchiveResult
|
||||
|
||||
file_path = snap_dir / path
|
||||
if not file_path.exists() or not file_path.is_file():
|
||||
continue
|
||||
|
||||
# Skip circular symlinks and other stat() failures
|
||||
try:
|
||||
file_size = result_file.stat().st_size or 0
|
||||
file_size = file_path.stat().st_size
|
||||
if file_size >= 15_000: # Only show files > 15KB
|
||||
archiveresults[extractor_name] = {
|
||||
'name': extractor_name,
|
||||
'path': path,
|
||||
'ts': ts_to_date_str(file_path.stat().st_mtime or 0),
|
||||
'size': file_size,
|
||||
'result': None,
|
||||
}
|
||||
except OSError:
|
||||
continue
|
||||
|
||||
if file_size > min_size_threshold:
|
||||
archiveresults[result_file.name] = {
|
||||
'name': result_file.stem,
|
||||
'path': result_file.relative_to(snap_dir),
|
||||
'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
|
||||
'size': file_size,
|
||||
'result': None, # No ArchiveResult object for filesystem-discovered files
|
||||
}
|
||||
|
||||
# Get available extractors from hooks (sorted by numeric prefix for ordering)
|
||||
# Convert to base names for display ordering
|
||||
all_extractors = [get_extractor_name(e) for e in get_extractors()]
|
||||
|
||||
Reference in New Issue
Block a user