mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
cleanup tui, startup, card templates, and more
This commit is contained in:
5
.claude/settings.json
Normal file
5
.claude/settings.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"enabledPlugins": {
|
||||
"pyright-lsp@claude-plugins-official": true
|
||||
}
|
||||
}
|
||||
@@ -30,10 +30,13 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
check_data_folder()
|
||||
|
||||
from django.core.management import call_command
|
||||
from django.contrib.auth.models import User
|
||||
|
||||
from archivebox.config.common import SHELL_CONFIG
|
||||
|
||||
run_in_debug = SHELL_CONFIG.DEBUG or debug or reload
|
||||
if debug or reload:
|
||||
SHELL_CONFIG.DEBUG = True
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
|
||||
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
|
||||
print()
|
||||
@@ -56,7 +59,8 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
if SHELL_CONFIG.DEBUG:
|
||||
if run_in_debug:
|
||||
from django.core.management import call_command
|
||||
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
|
||||
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
|
||||
|
||||
@@ -57,7 +57,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
|
||||
# Build output link - use embed_path() which checks output_files first
|
||||
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
|
||||
output_link = f'/archive/{result.snapshot.timestamp}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
|
||||
output_link = f'/{result.snapshot.archive_path}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/{result.snapshot.archive_path}/'
|
||||
|
||||
# Get version - try cmd_version field
|
||||
version = result.cmd_version if result.cmd_version else '-'
|
||||
@@ -83,8 +83,8 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
{icon}
|
||||
</td>
|
||||
<td style="padding: 10px 12px; font-weight: 500; color: #334155;">
|
||||
<a href="{output_link}" target="_blank"
|
||||
style="color: #334155; text-decoration: none;"
|
||||
<a href="{output_link}" target="_blank"
|
||||
style="color: #334155; text-decoration: none;"
|
||||
title="View output fullscreen"
|
||||
onmouseover="this.style.color='#2563eb'; this.style.textDecoration='underline';"
|
||||
onmouseout="this.style.color='#334155'; this.style.textDecoration='none';">
|
||||
@@ -301,8 +301,8 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
)
|
||||
def snapshot_info(self, result):
|
||||
return format_html(
|
||||
'<a href="/archive/{}/index.html"><b><code>[{}]</code></b> {} {}</a><br/>',
|
||||
result.snapshot.timestamp,
|
||||
'<a href="/{}/index.html"><b><code>[{}]</code></b> {} {}</a><br/>',
|
||||
result.snapshot.archive_path,
|
||||
str(result.snapshot.id)[:8],
|
||||
result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'),
|
||||
result.snapshot.url[:128],
|
||||
@@ -336,8 +336,8 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
|
||||
output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
|
||||
return format_html(
|
||||
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
|
||||
result.snapshot.timestamp,
|
||||
'<a href="/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
|
||||
result.snapshot.archive_path,
|
||||
output_path,
|
||||
result.output_str,
|
||||
)
|
||||
@@ -348,7 +348,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
'<pre style="display: inline-block">{}</pre><br/>',
|
||||
result.output_str,
|
||||
)
|
||||
output_html += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp))
|
||||
output_html += format_html('<a href="/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.archive_path))
|
||||
embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
|
||||
path_from_embed = (snapshot_dir / (embed_path or ''))
|
||||
output_html += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(embed_path))
|
||||
|
||||
@@ -237,13 +237,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
'''
|
||||
<div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: center;">
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/archive/{}"
|
||||
href="/{}"
|
||||
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||
📄 Summary Page
|
||||
</a>
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/archive/{}/index.html#all"
|
||||
href="/{}/index.html#all"
|
||||
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||
📁 Result Files
|
||||
@@ -291,8 +291,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
<b>Tip:</b> Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
|
||||
</p>
|
||||
''',
|
||||
obj.timestamp,
|
||||
obj.timestamp,
|
||||
obj.archive_path,
|
||||
obj.archive_path,
|
||||
obj.url,
|
||||
obj.pk,
|
||||
obj.pk,
|
||||
@@ -310,7 +310,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
'✅' if obj.is_archived else '❌',
|
||||
obj.num_outputs,
|
||||
self.size(obj) or '0kb',
|
||||
f'/archive/{obj.timestamp}/favicon.ico',
|
||||
f'/{obj.archive_path}/favicon.ico',
|
||||
obj.extension or '-',
|
||||
)
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
import ipaddress
|
||||
import re
|
||||
from django.utils import timezone
|
||||
from django.contrib.auth.middleware import RemoteUserMiddleware
|
||||
from django.core.exceptions import ImproperlyConfigured
|
||||
@@ -28,10 +29,11 @@ def TimezoneMiddleware(get_response):
|
||||
|
||||
|
||||
def CacheControlMiddleware(get_response):
|
||||
snapshot_path_re = re.compile(r"^/[^/]+/\\d{8}/[^/]+/[0-9a-fA-F-]{8,36}/")
|
||||
def middleware(request):
|
||||
response = get_response(request)
|
||||
|
||||
if '/archive/' in request.path or '/static/' in request.path:
|
||||
if '/archive/' in request.path or '/static/' in request.path or snapshot_path_re.match(request.path):
|
||||
policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
|
||||
response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
|
||||
# print('Set Cache-Control header to', response['Cache-Control'])
|
||||
|
||||
@@ -1296,7 +1296,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
)}
|
||||
|
||||
path = self.archive_path
|
||||
canon = self.canonical_outputs()
|
||||
output = ""
|
||||
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
|
||||
|
||||
@@ -1313,10 +1312,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
if not icon.strip() and not existing:
|
||||
continue
|
||||
|
||||
embed_path = result.embed_path() if result else f'{plugin}/'
|
||||
output += format_html(
|
||||
output_template,
|
||||
path,
|
||||
canon.get(plugin, plugin + '/'),
|
||||
embed_path,
|
||||
str(bool(existing)),
|
||||
plugin,
|
||||
icon
|
||||
@@ -1402,9 +1402,38 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
return
|
||||
|
||||
@cached_property
|
||||
def archive_path(self):
|
||||
def legacy_archive_path(self) -> str:
|
||||
return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}'
|
||||
|
||||
@cached_property
def url_path(self) -> str:
    """Return the URL path that mirrors where this snapshot's output_dir lives.

    The output dir is resolved and made relative to ``CONSTANTS.DATA_DIR``;
    if that fails for any reason the legacy ``archive/<timestamp>`` path is
    returned instead.
    """
    try:
        resolved_dir = Path(self.output_dir).resolve()
        segments = resolved_dir.relative_to(CONSTANTS.DATA_DIR).parts
    except Exception:
        return self.legacy_archive_path

    # New layout: users/<username>/snapshots/<YYYYMMDD>/<domain>/<uuid>/
    if len(segments) >= 6 and segments[0] == 'users' and segments[2] == 'snapshots':
        _, owner, _, day, host, snap_id = segments[:6]
        # the internal "system" user is exposed under the "web" prefix
        owner = 'web' if owner == 'system' else owner
        return '/'.join((owner, day, host, snap_id))

    # Legacy layout: archive/<timestamp>/
    if len(segments) >= 2 and segments[0] == CONSTANTS.ARCHIVE_DIR_NAME:
        return '/'.join(segments[:2])

    # Anything else: expose the path relative to the data dir as-is
    return '/'.join(segments)
|
||||
|
||||
@cached_property
|
||||
def archive_path(self):
|
||||
return self.url_path
|
||||
|
||||
@cached_property
|
||||
def archive_size(self):
|
||||
try:
|
||||
@@ -1467,8 +1496,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
for pid_file in Path(self.output_dir).glob('**/*.pid'):
|
||||
pid_file.unlink(missing_ok=True)
|
||||
|
||||
# Update all STARTED ArchiveResults from filesystem
|
||||
results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
|
||||
# Update all background ArchiveResults from filesystem (in case output arrived late)
|
||||
results = self.archiveresult_set.filter(hook_name__contains='.bg.')
|
||||
for ar in results:
|
||||
ar.update_from_output()
|
||||
|
||||
@@ -1914,153 +1943,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
# Output Path Methods (migrated from Link schema)
|
||||
# =========================================================================
|
||||
|
||||
def canonical_outputs(self) -> Dict[str, Optional[str]]:
|
||||
"""
|
||||
Intelligently discover the best output file for each plugin.
|
||||
Uses actual ArchiveResult data and filesystem scanning with smart heuristics.
|
||||
"""
|
||||
FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'
|
||||
|
||||
# Mimetypes that can be embedded/previewed in an iframe
|
||||
IFRAME_EMBEDDABLE_EXTENSIONS = {
|
||||
'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl',
|
||||
'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico',
|
||||
'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav',
|
||||
}
|
||||
|
||||
MIN_DISPLAY_SIZE = 15_000 # 15KB - filter out tiny files
|
||||
MAX_SCAN_FILES = 50 # Don't scan massive directories
|
||||
|
||||
def find_best_output_in_dir(dir_path: Path, plugin_name: str) -> Optional[str]:
|
||||
"""Find the best representative file in a plugin's output directory"""
|
||||
if not dir_path.exists() or not dir_path.is_dir():
|
||||
return None
|
||||
|
||||
candidates = []
|
||||
file_count = 0
|
||||
|
||||
# Special handling for media plugin - look for thumbnails
|
||||
is_media_dir = plugin_name == 'media'
|
||||
|
||||
# Scan for suitable files
|
||||
for file_path in dir_path.rglob('*'):
|
||||
file_count += 1
|
||||
if file_count > MAX_SCAN_FILES:
|
||||
break
|
||||
|
||||
if file_path.is_dir() or file_path.name.startswith('.'):
|
||||
continue
|
||||
|
||||
ext = file_path.suffix.lstrip('.').lower()
|
||||
if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
|
||||
continue
|
||||
|
||||
try:
|
||||
size = file_path.stat().st_size
|
||||
except OSError:
|
||||
continue
|
||||
|
||||
# For media dir, allow smaller image files (thumbnails are often < 15KB)
|
||||
min_size = 5_000 if (is_media_dir and ext in ('png', 'jpg', 'jpeg', 'webp', 'gif')) else MIN_DISPLAY_SIZE
|
||||
if size < min_size:
|
||||
continue
|
||||
|
||||
# Prefer main files: index.html, output.*, content.*, etc.
|
||||
priority = 0
|
||||
name_lower = file_path.name.lower()
|
||||
|
||||
if is_media_dir:
|
||||
# Special prioritization for media directories
|
||||
if any(keyword in name_lower for keyword in ('thumb', 'thumbnail', 'cover', 'poster')):
|
||||
priority = 200 # Highest priority for thumbnails
|
||||
elif ext in ('png', 'jpg', 'jpeg', 'webp', 'gif'):
|
||||
priority = 150 # High priority for any image
|
||||
elif ext in ('mp4', 'webm', 'mp3', 'opus', 'ogg'):
|
||||
priority = 100 # Lower priority for actual media files
|
||||
else:
|
||||
priority = 50
|
||||
elif 'index' in name_lower:
|
||||
priority = 100
|
||||
elif name_lower.startswith(('output', 'content', plugin_name)):
|
||||
priority = 50
|
||||
elif ext in ('html', 'htm', 'pdf'):
|
||||
priority = 30
|
||||
elif ext in ('png', 'jpg', 'jpeg', 'webp'):
|
||||
priority = 20
|
||||
else:
|
||||
priority = 10
|
||||
|
||||
candidates.append((priority, size, file_path))
|
||||
|
||||
if not candidates:
|
||||
return None
|
||||
|
||||
# Sort by priority (desc), then size (desc)
|
||||
candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
|
||||
best_file = candidates[0][2]
|
||||
return str(best_file.relative_to(Path(self.output_dir)))
|
||||
|
||||
canonical = {
|
||||
'index_path': 'index.html',
|
||||
'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
|
||||
'archivedotorg_path': f'https://web.archive.org/web/{self.base_url}',
|
||||
}
|
||||
|
||||
# Scan each ArchiveResult's output directory for the best file
|
||||
snap_dir = Path(self.output_dir)
|
||||
for result in self.archiveresult_set.filter(status='succeeded'):
|
||||
if not result.output_files and not result.output_str:
|
||||
continue
|
||||
|
||||
# Try to find the best output file for this plugin
|
||||
plugin_dir = snap_dir / result.plugin
|
||||
best_output = None
|
||||
|
||||
# Check output_files first (new field)
|
||||
if result.output_files:
|
||||
first_file = next(iter(result.output_files.keys()), None)
|
||||
if first_file and (plugin_dir / first_file).exists():
|
||||
best_output = f'{result.plugin}/{first_file}'
|
||||
|
||||
# Fallback to output_str if it looks like a path
|
||||
if not best_output and result.output_str and (snap_dir / result.output_str).exists():
|
||||
best_output = result.output_str
|
||||
|
||||
if not best_output and plugin_dir.exists():
|
||||
# Intelligently find the best file in the plugin's directory
|
||||
best_output = find_best_output_in_dir(plugin_dir, result.plugin)
|
||||
|
||||
if best_output:
|
||||
canonical[f'{result.plugin}_path'] = best_output
|
||||
|
||||
# Also scan top-level for legacy outputs (backwards compatibility)
|
||||
for file_path in snap_dir.glob('*'):
|
||||
if file_path.is_dir() or file_path.name in ('index.html', 'index.json'):
|
||||
continue
|
||||
|
||||
ext = file_path.suffix.lstrip('.').lower()
|
||||
if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
|
||||
continue
|
||||
|
||||
try:
|
||||
size = file_path.stat().st_size
|
||||
if size >= MIN_DISPLAY_SIZE:
|
||||
# Add as generic output with stem as key
|
||||
key = f'{file_path.stem}_path'
|
||||
if key not in canonical:
|
||||
canonical[key] = file_path.name
|
||||
except OSError:
|
||||
continue
|
||||
|
||||
if self.is_static:
|
||||
static_path = f'warc/{self.timestamp}'
|
||||
canonical.update({
|
||||
'title': self.basename,
|
||||
'wget_path': static_path,
|
||||
})
|
||||
|
||||
return canonical
|
||||
|
||||
def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
|
||||
"""Get the latest output that each plugin produced"""
|
||||
from archivebox.hooks import get_plugins
|
||||
@@ -2078,6 +1960,96 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
latest[plugin] = result.embed_path() if result else None
|
||||
return latest
|
||||
|
||||
def discover_outputs(self) -> list[dict]:
    """Discover output files from ArchiveResults and the snapshot directory.

    First every ArchiveResult (ordered by ``start_ts``) whose ``embed_path()``
    exists on disk is listed, then the snapshot directory is scanned for
    plugin directories / top-level files that were not already covered.

    Returns:
        A list of dicts with the keys ``name``, ``path`` (relative to the
        snapshot dir), ``ts``, ``size`` (bytes), ``is_metadata``,
        ``is_compact`` and ``result`` (the ArchiveResult, or ``None`` for
        entries found only on the filesystem).
    """
    from archivebox.misc.util import ts_to_date_str

    ArchiveResult = self.archiveresult_set.model
    snap_dir = Path(self.output_dir)
    outputs: list[dict] = []
    seen: set[str] = set()

    text_exts = ('.json', '.jsonl', '.txt', '.csv', '.tsv', '.xml', '.yml', '.yaml', '.md', '.log')

    def is_text_path(path: str | None) -> bool:
        return (path or '').lower().endswith(text_exts)

    def make_entry(name: str, path: str, ts, size: int, result) -> dict:
        # is_metadata and is_compact are both derived from the same
        # text-extension check, so compute it once and reuse it.
        is_text = is_text_path(path)
        return {
            'name': name,
            'path': path,
            'ts': ts,
            'size': size or 0,
            'is_metadata': is_text,
            'is_compact': is_text,
            'result': result,
        }

    for result in self.archiveresult_set.all().order_by('start_ts'):
        embed_path = result.embed_path()
        if not embed_path or embed_path.strip() in ('.', '/', './'):
            continue
        abs_path = snap_dir / embed_path
        if not abs_path.exists():
            continue
        if abs_path.is_dir():
            # Walk the directory once: an empty list means "no files", which is skipped
            file_sizes = [p.stat().st_size for p in abs_path.rglob('*') if p.is_file()]
            if not file_sizes:
                continue
            size = sum(file_sizes)
        else:
            size = abs_path.stat().st_size
        outputs.append(make_entry(result.plugin, embed_path, ts_to_date_str(result.end_ts), size, result))
        seen.add(result.plugin)

    # Path.iterdir() raises if the directory is missing; the loop above already
    # tolerates missing paths, so treat a missing snapshot dir as "nothing more to find".
    if not snap_dir.is_dir():
        return outputs

    embeddable_exts = {
        'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl', 'csv', 'tsv',
        'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico',
        'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav',
    }

    for entry in snap_dir.iterdir():
        if entry.name in ('index.html', 'index.json', 'favicon.ico', 'warc'):
            continue

        if entry.is_dir():
            plugin = entry.name
            if plugin in seen:
                continue
            best_file = ArchiveResult._find_best_output_file(entry, plugin)
            if not best_file:
                continue
            rel_path = str(best_file.relative_to(snap_dir))
            file_stat = best_file.stat()  # stat once, reuse for mtime and size
            outputs.append(make_entry(plugin, rel_path, ts_to_date_str(file_stat.st_mtime or 0), file_stat.st_size, None))
            seen.add(plugin)
        elif entry.is_file():
            ext = entry.suffix.lstrip('.').lower()
            if ext not in embeddable_exts:
                continue
            plugin = entry.stem
            if plugin in seen:
                continue
            file_stat = entry.stat()  # stat once, reuse for mtime and size
            outputs.append(make_entry(plugin, entry.name, ts_to_date_str(file_stat.st_mtime or 0), file_stat.st_size, None))
            seen.add(plugin)

    return outputs
|
||||
|
||||
# =========================================================================
|
||||
# Serialization Methods
|
||||
# =========================================================================
|
||||
@@ -2114,8 +2086,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
'num_outputs': self.num_outputs,
|
||||
'num_failures': self.num_failures,
|
||||
}
|
||||
if extended:
|
||||
result['canonical'] = self.canonical_outputs()
|
||||
return result
|
||||
|
||||
def to_json_str(self, indent: int = 4) -> str:
|
||||
@@ -2146,23 +2116,29 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
|
||||
TITLE_LOADING_MSG = 'Not yet archived...'
|
||||
|
||||
canonical = self.canonical_outputs()
|
||||
preview_priority = [
|
||||
'singlefile_path',
|
||||
'screenshot_path',
|
||||
'wget_path',
|
||||
'dom_path',
|
||||
'pdf_path',
|
||||
'readability_path',
|
||||
'singlefile',
|
||||
'screenshot',
|
||||
'wget',
|
||||
'dom',
|
||||
'pdf',
|
||||
'readability',
|
||||
]
|
||||
best_preview_path = next(
|
||||
(canonical.get(key) for key in preview_priority if canonical.get(key)),
|
||||
canonical.get('index_path', 'index.html'),
|
||||
)
|
||||
|
||||
outputs = self.discover_outputs()
|
||||
outputs_by_plugin = {out['name']: out for out in outputs}
|
||||
|
||||
best_preview_path = 'about:blank'
|
||||
for plugin in preview_priority:
|
||||
out = outputs_by_plugin.get(plugin)
|
||||
if out and out.get('path'):
|
||||
best_preview_path = out['path']
|
||||
break
|
||||
|
||||
if best_preview_path == 'about:blank' and outputs:
|
||||
best_preview_path = outputs[0].get('path') or 'about:blank'
|
||||
context = {
|
||||
**self.to_dict(extended=True),
|
||||
**{f'{k}_path': v for k, v in canonical.items()},
|
||||
'canonical': {f'{k}_path': v for k, v in canonical.items()},
|
||||
'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
|
||||
'url_str': htmlencode(urldecode(self.base_url)),
|
||||
'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
|
||||
@@ -2175,6 +2151,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
|
||||
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
|
||||
'best_preview_path': best_preview_path,
|
||||
'archiveresults': outputs,
|
||||
}
|
||||
rendered_html = render_to_string('snapshot.html', context)
|
||||
atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
|
||||
@@ -2496,6 +2473,61 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
def output_exists(self) -> bool:
|
||||
return os.path.exists(Path(self.snapshot_dir) / self.plugin)
|
||||
|
||||
@staticmethod
|
||||
def _find_best_output_file(dir_path: Path, plugin_name: str | None = None) -> Optional[Path]:
|
||||
if not dir_path.exists() or not dir_path.is_dir():
|
||||
return None
|
||||
|
||||
embeddable_exts = {
|
||||
'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl', 'csv', 'tsv',
|
||||
'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico',
|
||||
'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav',
|
||||
}
|
||||
|
||||
for name in ('index.html', 'index.htm'):
|
||||
candidate = dir_path / name
|
||||
if candidate.exists() and candidate.is_file():
|
||||
return candidate
|
||||
|
||||
candidates = []
|
||||
file_count = 0
|
||||
max_scan = 200
|
||||
plugin_lower = (plugin_name or '').lower()
|
||||
for file_path in dir_path.rglob('*'):
|
||||
file_count += 1
|
||||
if file_count > max_scan:
|
||||
break
|
||||
if file_path.is_dir() or file_path.name.startswith('.'):
|
||||
continue
|
||||
ext = file_path.suffix.lstrip('.').lower()
|
||||
if ext not in embeddable_exts:
|
||||
continue
|
||||
try:
|
||||
size = file_path.stat().st_size
|
||||
except OSError:
|
||||
continue
|
||||
name_lower = file_path.name.lower()
|
||||
priority = 0
|
||||
if name_lower.startswith('index'):
|
||||
priority = 100
|
||||
elif plugin_lower and name_lower.startswith(('output', 'content', plugin_lower)):
|
||||
priority = 60
|
||||
elif ext in ('html', 'htm', 'pdf'):
|
||||
priority = 40
|
||||
elif ext in ('png', 'jpg', 'jpeg', 'webp', 'svg', 'gif', 'ico'):
|
||||
priority = 30
|
||||
elif ext in ('json', 'jsonl', 'txt', 'md', 'csv', 'tsv'):
|
||||
priority = 20
|
||||
else:
|
||||
priority = 10
|
||||
candidates.append((priority, size, file_path))
|
||||
|
||||
if not candidates:
|
||||
return None
|
||||
|
||||
candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
|
||||
return candidates[0][2]
|
||||
|
||||
def embed_path(self) -> Optional[str]:
|
||||
"""
|
||||
Get the relative path to the embeddable output file for this result.
|
||||
@@ -2503,25 +2535,45 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
Returns the first file from output_files if set, otherwise tries to
|
||||
find a reasonable default based on the plugin type.
|
||||
"""
|
||||
# Check output_files dict for primary output
|
||||
snapshot_dir = Path(self.snapshot_dir)
|
||||
plugin_dir = snapshot_dir / self.plugin
|
||||
|
||||
# Fallback: treat output_str as a file path only if it exists on disk
|
||||
if self.output_str:
|
||||
try:
|
||||
output_path = Path(self.output_str)
|
||||
|
||||
if output_path.is_absolute():
|
||||
# If absolute and within snapshot dir, normalize to relative
|
||||
if snapshot_dir in output_path.parents and output_path.exists():
|
||||
return str(output_path.relative_to(snapshot_dir))
|
||||
else:
|
||||
# If relative, prefer plugin-prefixed path, then direct path
|
||||
if (plugin_dir / output_path).exists():
|
||||
return f'{self.plugin}/{output_path}'
|
||||
if output_path.name in ('index.html', 'index.json') and output_path.parent == Path('.'):
|
||||
return None
|
||||
if (snapshot_dir / output_path).exists():
|
||||
return str(output_path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Check output_files dict for primary output (ignore non-output files)
|
||||
if self.output_files:
|
||||
# Return first file from output_files (dict preserves insertion order)
|
||||
first_file = next(iter(self.output_files.keys()), None)
|
||||
if first_file:
|
||||
ignored = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid', 'cmd.sh'}
|
||||
output_candidates = [
|
||||
f for f in self.output_files.keys()
|
||||
if Path(f).name not in ignored
|
||||
]
|
||||
first_file = output_candidates[0] if output_candidates else None
|
||||
if first_file and (plugin_dir / first_file).exists():
|
||||
return f'{self.plugin}/{first_file}'
|
||||
|
||||
# Fallback: check output_str if it looks like a file path
|
||||
if self.output_str and ('/' in self.output_str or '.' in self.output_str):
|
||||
return self.output_str
|
||||
best_file = self._find_best_output_file(plugin_dir, self.plugin)
|
||||
if best_file:
|
||||
return str(best_file.relative_to(snapshot_dir))
|
||||
|
||||
# Try to find output file based on plugin's canonical output path
|
||||
canonical = self.snapshot.canonical_outputs()
|
||||
plugin_key = f'{self.plugin}_path'
|
||||
if plugin_key in canonical:
|
||||
return canonical[plugin_key]
|
||||
|
||||
# Fallback to plugin directory
|
||||
return f'{self.plugin}/'
|
||||
return None
|
||||
|
||||
def create_output_dir(self):
|
||||
output_dir = Path(self.snapshot_dir) / self.plugin
|
||||
@@ -2779,7 +2831,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
self.output_str = 'Hook did not output ArchiveResult record'
|
||||
|
||||
# Walk filesystem and populate output_files, output_size, output_mimetypes
|
||||
exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
|
||||
exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid', 'cmd.sh'}
|
||||
mime_sizes = defaultdict(int)
|
||||
total_size = 0
|
||||
output_files = {}
|
||||
|
||||
@@ -48,6 +48,19 @@ class CustomOutboundWebhookLogFormatter(logging.Formatter):
|
||||
result = super().format(record)
|
||||
return result.replace('HTTP Request: ', 'OutboundWebhook: ')
|
||||
|
||||
class StripANSIColorCodesFilter(logging.Filter):
    """Logging filter that removes ANSI color codes from log messages.

    Both full escape sequences (``ESC[...m``) and "bare" sequences that have
    lost their ESC byte (``[...m``) are removed. The filter never drops a
    record; it only rewrites the message when something was stripped.
    """

    _ansi_re = re.compile(r'\x1b\[[0-9;]*m')
    _bare_re = re.compile(r'\[[0-9;]*m')

    def filter(self, record) -> bool:
        """Strip color codes from ``record`` in place and always return True."""
        try:
            msg = record.getMessage()
        except Exception:
            # A malformed log call (msg/args mismatch) must not raise from
            # inside a filter; leave the record alone so the handler's own
            # error reporting deals with it.
            return True

        if isinstance(msg, str):
            # Run the patterns directly instead of pre-checking for '[m':
            # that substring test missed bare codes such as '[32m'.
            stripped = self._bare_re.sub('', self._ansi_re.sub('', msg))
            if stripped != msg:
                # msg is already %-formatted, so the args must be cleared
                record.msg = stripped
                record.args = ()
        return True
|
||||
|
||||
|
||||
ERROR_LOG = tempfile.NamedTemporaryFile().name
|
||||
|
||||
@@ -87,6 +100,9 @@ SETTINGS_LOGGING = {
|
||||
"noisyrequestsfilter": {
|
||||
"()": NoisyRequestsFilter,
|
||||
},
|
||||
"stripansi": {
|
||||
"()": StripANSIColorCodesFilter,
|
||||
},
|
||||
"require_debug_false": {
|
||||
"()": "django.utils.log.RequireDebugFalse",
|
||||
},
|
||||
@@ -101,7 +117,7 @@ SETTINGS_LOGGING = {
|
||||
"level": "DEBUG",
|
||||
"markup": False,
|
||||
"rich_tracebacks": False, # Use standard Python tracebacks (no frame/box)
|
||||
"filters": ["noisyrequestsfilter"],
|
||||
"filters": ["noisyrequestsfilter", "stripansi"],
|
||||
},
|
||||
"logfile": {
|
||||
"level": "INFO",
|
||||
@@ -110,7 +126,7 @@ SETTINGS_LOGGING = {
|
||||
"maxBytes": 1024 * 1024 * 25, # 25 MB
|
||||
"backupCount": 10,
|
||||
"formatter": "rich",
|
||||
"filters": ["noisyrequestsfilter"],
|
||||
"filters": ["noisyrequestsfilter", "stripansi"],
|
||||
},
|
||||
"outbound_webhooks": {
|
||||
"class": "rich.logging.RichHandler",
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
from django import template
|
||||
from django.contrib.admin.templatetags.base import InclusionAdminNode
|
||||
from django.utils.safestring import mark_safe
|
||||
from django.utils.html import escape
|
||||
|
||||
from typing import Union
|
||||
from pathlib import Path
|
||||
|
||||
from archivebox.hooks import (
|
||||
get_plugin_icon, get_plugin_template, get_plugin_name,
|
||||
@@ -57,15 +59,18 @@ def plugin_icon(plugin: str) -> str:
|
||||
|
||||
Usage: {% plugin_icon "screenshot" %}
|
||||
"""
|
||||
return mark_safe(get_plugin_icon(plugin))
|
||||
icon_html = get_plugin_icon(plugin)
|
||||
return mark_safe(
|
||||
f'<span class="abx-plugin-icon" style="display:inline-flex; width:20px; height:20px; align-items:center; justify-content:center;">{icon_html}</span>'
|
||||
)
|
||||
|
||||
|
||||
@register.simple_tag(takes_context=True)
|
||||
def plugin_thumbnail(context, result) -> str:
|
||||
def plugin_card(context, result) -> str:
|
||||
"""
|
||||
Render the thumbnail template for an archive result.
|
||||
Render the card template for an archive result.
|
||||
|
||||
Usage: {% plugin_thumbnail result %}
|
||||
Usage: {% plugin_card result %}
|
||||
|
||||
Context variables passed to template:
|
||||
- result: ArchiveResult object
|
||||
@@ -74,46 +79,97 @@ def plugin_thumbnail(context, result) -> str:
|
||||
- plugin: Plugin base name
|
||||
"""
|
||||
plugin = get_plugin_name(result.plugin)
|
||||
template_str = get_plugin_template(plugin, 'thumbnail')
|
||||
template_str = get_plugin_template(plugin, 'card')
|
||||
|
||||
if not template_str:
|
||||
return ''
|
||||
# Use embed_path() for the display path
|
||||
output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
|
||||
|
||||
# Use embed_path() for the display path (includes canonical paths)
|
||||
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
|
||||
icon_html = get_plugin_icon(plugin)
|
||||
|
||||
output_lower = (output_path or '').lower()
|
||||
text_preview_exts = ('.json', '.jsonl', '.txt', '.csv', '.tsv', '.xml', '.yml', '.yaml', '.md', '.log')
|
||||
force_text_preview = output_lower.endswith(text_preview_exts)
|
||||
|
||||
# Create a mini template and render it with context
|
||||
try:
|
||||
tpl = template.Template(template_str)
|
||||
ctx = template.Context({
|
||||
'result': result,
|
||||
'snapshot': result.snapshot,
|
||||
'output_path': output_path,
|
||||
'plugin': plugin,
|
||||
})
|
||||
rendered = tpl.render(ctx)
|
||||
# Only return non-empty content (strip whitespace to check)
|
||||
if rendered.strip():
|
||||
return mark_safe(rendered)
|
||||
return ''
|
||||
if template_str and output_path and str(output_path).strip() not in ('.', '/', './') and not force_text_preview:
|
||||
tpl = template.Template(template_str)
|
||||
ctx = template.Context({
|
||||
'result': result,
|
||||
'snapshot': result.snapshot,
|
||||
'output_path': output_path,
|
||||
'plugin': plugin,
|
||||
'plugin_icon': icon_html,
|
||||
})
|
||||
rendered = tpl.render(ctx)
|
||||
# Only return non-empty content (strip whitespace to check)
|
||||
if rendered.strip():
|
||||
return mark_safe(rendered)
|
||||
except Exception:
|
||||
return ''
|
||||
pass
|
||||
|
||||
if force_text_preview and output_path and str(output_path).strip() not in ('.', '/', './'):
|
||||
output_file = Path(output_path)
|
||||
if not output_file.is_absolute():
|
||||
output_file = Path(result.snapshot_dir) / output_path
|
||||
try:
|
||||
output_file = output_file.resolve()
|
||||
snap_dir = Path(result.snapshot_dir).resolve()
|
||||
if snap_dir not in output_file.parents and output_file != snap_dir:
|
||||
output_file = None
|
||||
except Exception:
|
||||
output_file = None
|
||||
if output_file and output_file.exists() and output_file.is_file():
|
||||
try:
|
||||
with output_file.open('rb') as f:
|
||||
raw = f.read(4096)
|
||||
text = raw.decode('utf-8', errors='replace').strip()
|
||||
if text:
|
||||
lines = text.splitlines()[:6]
|
||||
snippet = '\n'.join(lines)
|
||||
escaped = escape(snippet)
|
||||
preview = (
|
||||
f'<div class="thumbnail-text" data-plugin="{plugin}" data-compact="1">'
|
||||
f'<div class="thumbnail-text-header">'
|
||||
f'<span class="thumbnail-compact-icon">{icon_html}</span>'
|
||||
f'<span class="thumbnail-text-title">{plugin}</span>'
|
||||
f'</div>'
|
||||
f'<pre class="thumbnail-text-pre">{escaped}</pre>'
|
||||
f'</div>'
|
||||
)
|
||||
return mark_safe(preview)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if output_lower.endswith(text_preview_exts):
|
||||
fallback_label = 'text'
|
||||
else:
|
||||
fallback_label = 'output'
|
||||
|
||||
fallback = (
|
||||
f'<div class="thumbnail-compact" data-plugin="{plugin}" data-compact="1">'
|
||||
f'<span class="thumbnail-compact-icon">{icon_html}</span>'
|
||||
f'<span class="thumbnail-compact-label">{plugin}</span>'
|
||||
f'<span class="thumbnail-compact-meta">{fallback_label}</span>'
|
||||
f'</div>'
|
||||
)
|
||||
return mark_safe(fallback)
|
||||
|
||||
|
||||
@register.simple_tag(takes_context=True)
|
||||
def plugin_embed(context, result) -> str:
|
||||
def plugin_full(context, result) -> str:
|
||||
"""
|
||||
Render the embed iframe template for an archive result.
|
||||
Render the full template for an archive result.
|
||||
|
||||
Usage: {% plugin_embed result %}
|
||||
Usage: {% plugin_full result %}
|
||||
"""
|
||||
plugin = get_plugin_name(result.plugin)
|
||||
template_str = get_plugin_template(plugin, 'embed')
|
||||
template_str = get_plugin_template(plugin, 'full')
|
||||
|
||||
if not template_str:
|
||||
return ''
|
||||
|
||||
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
|
||||
output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
|
||||
|
||||
try:
|
||||
tpl = template.Template(template_str)
|
||||
@@ -132,36 +188,6 @@ def plugin_embed(context, result) -> str:
|
||||
return ''
|
||||
|
||||
|
||||
@register.simple_tag(takes_context=True)
|
||||
def plugin_fullscreen(context, result) -> str:
|
||||
"""
|
||||
Render the fullscreen template for an archive result.
|
||||
|
||||
Usage: {% plugin_fullscreen result %}
|
||||
"""
|
||||
plugin = get_plugin_name(result.plugin)
|
||||
template_str = get_plugin_template(plugin, 'fullscreen')
|
||||
|
||||
if not template_str:
|
||||
return ''
|
||||
|
||||
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
|
||||
|
||||
try:
|
||||
tpl = template.Template(template_str)
|
||||
ctx = template.Context({
|
||||
'result': result,
|
||||
'snapshot': result.snapshot,
|
||||
'output_path': output_path,
|
||||
'plugin': plugin,
|
||||
})
|
||||
rendered = tpl.render(ctx)
|
||||
# Only return non-empty content (strip whitespace to check)
|
||||
if rendered.strip():
|
||||
return mark_safe(rendered)
|
||||
return ''
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
|
||||
@register.filter
|
||||
|
||||
@@ -8,7 +8,7 @@ from django.views.generic.base import RedirectView
|
||||
from archivebox.misc.serve_static import serve_static
|
||||
|
||||
from archivebox.core.admin_site import archivebox_admin
|
||||
from archivebox.core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
|
||||
from archivebox.core.views import HomepageView, SnapshotView, SnapshotPathView, PublicIndexView, AddView, HealthCheckView, live_progress_view
|
||||
|
||||
from archivebox.workers.views import JobsDashboardView
|
||||
|
||||
@@ -32,6 +32,8 @@ urlpatterns = [
|
||||
|
||||
path('archive/', RedirectView.as_view(url='/')),
|
||||
path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
|
||||
re_path(r'^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<url>https?://.*)$', SnapshotPathView.as_view(), name='snapshot-path-url'),
|
||||
re_path(r'^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<domain>[^/]+)(?:/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?)?$', SnapshotPathView.as_view(), name='snapshot-path'),
|
||||
|
||||
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
|
||||
path('add/', AddView.as_view(), name='add'),
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
import os
|
||||
import sys
|
||||
from django.utils import timezone
|
||||
import inspect
|
||||
from typing import Callable, get_type_hints
|
||||
@@ -26,7 +25,7 @@ import archivebox
|
||||
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
|
||||
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
|
||||
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
|
||||
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
|
||||
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode
|
||||
from archivebox.misc.serve_static import serve_static_with_byterange_support
|
||||
from archivebox.misc.logging_util import printable_filesize
|
||||
from archivebox.search import query_search_index
|
||||
@@ -52,70 +51,44 @@ class HomepageView(View):
|
||||
class SnapshotView(View):
|
||||
# render static html index from filesystem archive/<timestamp>/index.html
|
||||
|
||||
@staticmethod
|
||||
def find_snapshots_for_url(path: str):
    """
    Resolve a URL-ish path to a queryset of matching snapshots.

    Matching strategies are tried from strictest to loosest, and the first
    one that yields any rows wins:

    1. (only when ``path`` carries an http/https scheme) the path verbatim
       as the snapshot URL, or as a substring of the snapshot id
    2. the scheme-less path with either scheme prepended, or as a substring
       of the snapshot id
    3. ``base_url()`` of the scheme-less path with either scheme prepended
    4. any snapshot whose URL starts with that base URL (returned even when
       it matches nothing)
    """
    schemes = ('http://', 'https://')

    if path.startswith(schemes):
        verbatim = Snapshot.objects.filter(Q(url=path) | Q(id__icontains=path))
        if verbatim.exists():
            return verbatim
        # drop the scheme so both http:// and https:// variants can be tried
        schemeless = path.split('://', 1)[1]
    else:
        schemeless = path

    http_full = 'http://' + schemeless
    https_full = 'https://' + schemeless
    by_url_or_id = Snapshot.objects.filter(
        Q(url=http_full) | Q(url=https_full) | Q(id__icontains=schemeless)
    )
    if by_url_or_id.exists():
        return by_url_or_id

    # only computed once the stricter lookups have come up empty
    stem = base_url(schemeless)
    http_stem = 'http://' + stem
    https_stem = 'https://' + stem

    by_base = Snapshot.objects.filter(Q(url=http_stem) | Q(url=https_stem))
    if by_base.exists():
        return by_base

    # loosest match: treat the base URL as a prefix
    return Snapshot.objects.filter(
        Q(url__startswith=http_stem) | Q(url__startswith=https_stem)
    )
|
||||
|
||||
@staticmethod
|
||||
def render_live_index(request, snapshot):
|
||||
TITLE_LOADING_MSG = 'Not yet archived...'
|
||||
|
||||
# Dict of plugin -> ArchiveResult object
|
||||
archiveresult_objects = {}
|
||||
# Dict of plugin -> result info dict (for template compatibility)
|
||||
archiveresults = {}
|
||||
|
||||
results = snapshot.archiveresult_set.all()
|
||||
|
||||
for result in results:
|
||||
embed_path = result.embed_path()
|
||||
abs_path = result.snapshot_dir / (embed_path or 'None')
|
||||
|
||||
if (result.status == 'succeeded'
|
||||
and embed_path
|
||||
and os.access(abs_path, os.R_OK)
|
||||
and abs_path.exists()):
|
||||
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
|
||||
continue
|
||||
|
||||
# Store the full ArchiveResult object for template tags
|
||||
archiveresult_objects[result.plugin] = result
|
||||
|
||||
result_info = {
|
||||
'name': result.plugin,
|
||||
'path': embed_path,
|
||||
'ts': ts_to_date_str(result.end_ts),
|
||||
'size': abs_path.stat().st_size or '?',
|
||||
'result': result, # Include the full object for template tags
|
||||
}
|
||||
archiveresults[result.plugin] = result_info
|
||||
|
||||
# Use canonical_outputs for intelligent discovery
|
||||
# This method now scans ArchiveResults and uses smart heuristics
|
||||
canonical = snapshot.canonical_outputs()
|
||||
|
||||
# Add any newly discovered outputs from canonical_outputs to archiveresults
|
||||
outputs = snapshot.discover_outputs()
|
||||
archiveresults = {out['name']: out for out in outputs}
|
||||
snap_dir = Path(snapshot.output_dir)
|
||||
for key, path in canonical.items():
|
||||
if not key.endswith('_path') or not path or path.startswith('http'):
|
||||
continue
|
||||
|
||||
plugin_name = key.replace('_path', '')
|
||||
if plugin_name in archiveresults:
|
||||
continue # Already have this from ArchiveResult
|
||||
|
||||
file_path = snap_dir / path
|
||||
if not file_path.exists() or not file_path.is_file():
|
||||
continue
|
||||
|
||||
try:
|
||||
file_size = file_path.stat().st_size
|
||||
if file_size >= 15_000: # Only show files > 15KB
|
||||
archiveresults[plugin_name] = {
|
||||
'name': plugin_name,
|
||||
'path': path,
|
||||
'ts': ts_to_date_str(file_path.stat().st_mtime or 0),
|
||||
'size': file_size,
|
||||
'result': None,
|
||||
}
|
||||
except OSError:
|
||||
continue
|
||||
|
||||
# Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
|
||||
# Convert to base names for display ordering
|
||||
@@ -131,7 +104,7 @@ class SnapshotView(View):
|
||||
preferred_types = tuple(preview_priority + [p for p in all_plugins if p not in preview_priority])
|
||||
all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
|
||||
|
||||
best_result = {'path': 'None', 'result': None}
|
||||
best_result = {'path': 'about:blank', 'result': None}
|
||||
for result_type in preferred_types:
|
||||
if result_type in archiveresults:
|
||||
best_result = archiveresults[result_type]
|
||||
@@ -146,7 +119,6 @@ class SnapshotView(View):
|
||||
|
||||
context = {
|
||||
**snapshot_info,
|
||||
**snapshot_info.get('canonical', {}),
|
||||
'title': htmlencode(
|
||||
snapshot.title
|
||||
or (snapshot.base_url if snapshot.is_archived else TITLE_LOADING_MSG)
|
||||
@@ -188,6 +160,14 @@ class SnapshotView(View):
|
||||
try:
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
|
||||
canonical_base = snapshot.url_path
|
||||
if canonical_base != snapshot.legacy_archive_path:
|
||||
target_path = f'/{canonical_base}/{archivefile or "index.html"}'
|
||||
query = request.META.get('QUERY_STRING')
|
||||
if query:
|
||||
target_path = f'{target_path}?{query}'
|
||||
return redirect(target_path)
|
||||
|
||||
if archivefile == 'index.html':
|
||||
# if they requested snapshot index, serve live rendered template instead of static html
|
||||
response = self.render_live_index(request, snapshot)
|
||||
@@ -221,9 +201,9 @@ class SnapshotView(View):
|
||||
except Snapshot.MultipleObjectsReturned:
|
||||
snapshot_hrefs = mark_safe('<br/>').join(
|
||||
format_html(
|
||||
'{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
|
||||
'{} <a href="/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
|
||||
snap.bookmarked_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
snap.timestamp,
|
||||
snap.archive_path,
|
||||
snap.timestamp,
|
||||
snap.url,
|
||||
snap.title_stripped[:64] or '',
|
||||
@@ -259,9 +239,9 @@ class SnapshotView(View):
|
||||
#'</script>'
|
||||
'</head><body>'
|
||||
'<center><br/><br/><br/>'
|
||||
f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a>: <a href="{snapshot.url}" target="_blank" rel="noreferrer">{snapshot.url}</a><br/>'
|
||||
f'Snapshot <a href="/{snapshot.archive_path}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a>: <a href="{snapshot.url}" target="_blank" rel="noreferrer">{snapshot.url}</a><br/>'
|
||||
f'was queued on {str(snapshot.bookmarked_at).split(".")[0]}, '
|
||||
f'but no files have been saved yet in:<br/><b><a href="/archive/{snapshot.timestamp}/" target="_top"><code>{snapshot.timestamp}</code></a><code>/'
|
||||
f'but no files have been saved yet in:<br/><b><a href="/{snapshot.archive_path}/" target="_top"><code>{snapshot.timestamp}</code></a><code>/'
|
||||
'{}'
|
||||
f'</code></b><br/><br/>'
|
||||
'It\'s possible {} '
|
||||
@@ -270,8 +250,8 @@ class SnapshotView(View):
|
||||
f'<code style="user-select: all; color: #333">archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
|
||||
'<div class="text-align: left; width: 100%; max-width: 400px">'
|
||||
'<i><b>Next steps:</i></b><br/>'
|
||||
f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
|
||||
f'- view the <a href="/archive/{snapshot.timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
|
||||
f'- list all the <a href="/{snapshot.archive_path}/" target="_top">Snapshot files <code>.*</code></a><br/>'
|
||||
f'- view the <a href="/{snapshot.archive_path}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
|
||||
f'- go to the <a href="/admin/core/snapshot/{snapshot.pk}/change/" target="_top">Snapshot admin</a> to edit<br/>'
|
||||
f'- go to the <a href="/admin/core/snapshot/?id__exact={snapshot.id}" target="_top">Snapshot actions</a> to re-archive<br/>'
|
||||
'- or return to <a href="/" target="_top">the main index...</a></div>'
|
||||
@@ -288,22 +268,9 @@ class SnapshotView(View):
|
||||
# slug is a URL
|
||||
try:
|
||||
try:
|
||||
# try exact match on full url / ID first
|
||||
snapshot = Snapshot.objects.get(
|
||||
Q(url='http://' + path) | Q(url='https://' + path) | Q(id__icontains=path)
|
||||
)
|
||||
snapshot = SnapshotView.find_snapshots_for_url(path).get()
|
||||
except Snapshot.DoesNotExist:
|
||||
# fall back to match on exact base_url
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(
|
||||
Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path))
|
||||
)
|
||||
except Snapshot.DoesNotExist:
|
||||
# fall back to matching base_url as prefix
|
||||
snapshot = Snapshot.objects.get(
|
||||
Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
|
||||
)
|
||||
return redirect(f'/archive/{snapshot.timestamp}/index.html')
|
||||
raise
|
||||
except Snapshot.DoesNotExist:
|
||||
return HttpResponse(
|
||||
format_html(
|
||||
@@ -322,20 +289,18 @@ class SnapshotView(View):
|
||||
status=404,
|
||||
)
|
||||
except Snapshot.MultipleObjectsReturned:
|
||||
snapshots = SnapshotView.find_snapshots_for_url(path)
|
||||
snapshot_hrefs = mark_safe('<br/>').join(
|
||||
format_html(
|
||||
'{} <code style="font-size: 0.8em">{}</code> <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
|
||||
'{} <code style="font-size: 0.8em">{}</code> <a href="/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
|
||||
snap.bookmarked_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
str(snap.id)[:8],
|
||||
snap.timestamp,
|
||||
snap.archive_path,
|
||||
snap.timestamp,
|
||||
snap.url,
|
||||
snap.title_stripped[:64] or '',
|
||||
)
|
||||
for snap in Snapshot.objects.filter(
|
||||
Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
|
||||
| Q(id__icontains=path)
|
||||
).only('url', 'timestamp', 'title', 'bookmarked_at').order_by('-bookmarked_at')
|
||||
for snap in snapshots.only('url', 'timestamp', 'title', 'bookmarked_at').order_by('-bookmarked_at')
|
||||
)
|
||||
return HttpResponse(
|
||||
format_html(
|
||||
@@ -353,6 +318,108 @@ class SnapshotView(View):
|
||||
status=404,
|
||||
)
|
||||
|
||||
target_path = f'/{snapshot.archive_path}/index.html'
|
||||
query = request.META.get('QUERY_STRING')
|
||||
if query:
|
||||
target_path = f'{target_path}?{query}'
|
||||
return redirect(target_path)
|
||||
|
||||
|
||||
class SnapshotPathView(View):
    """Serve snapshots by the new URL scheme: /<username>/<YYYYMMDD>/<domain>/<uuid>/..."""

    @staticmethod
    def _with_query(request, target: str) -> str:
        """Return ``target`` with the request's raw query string re-appended, if there is one."""
        query = request.META.get('QUERY_STRING')
        return f'{target}?{query}' if query else target

    @staticmethod
    def _find_snapshot_by_id(snapshot_id: str):
        """
        Look up a snapshot by full primary key, falling back to an id prefix.

        Returns the matching Snapshot, the first match when a prefix is
        ambiguous, or None when nothing matches.
        """
        # Local import: the file-level import block is not visible from here.
        from django.core.exceptions import ValidationError

        try:
            return Snapshot.objects.get(pk=snapshot_id)
        except (Snapshot.DoesNotExist, ValidationError):
            # NOTE(review): assumes the pk is a UUID field (the URL scheme says
            # <uuid>, model not visible here). The URL pattern accepts 8-36
            # char prefixes, and a malformed UUID raises ValidationError rather
            # than DoesNotExist, which previously escaped as a server error
            # instead of reaching the prefix lookup below.
            pass

        try:
            return Snapshot.objects.get(id__startswith=snapshot_id)
        except Snapshot.DoesNotExist:
            return None
        except Snapshot.MultipleObjectsReturned:
            return Snapshot.objects.filter(id__startswith=snapshot_id).first()

    def get(self, request, username: str, date: str, domain: str | None = None, snapshot_id: str | None = None, path: str = "", url: str | None = None):
        """
        Resolve the requested snapshot and serve its live index or one of its files.

        Args:
            request: the incoming HTTP request.
            username: owner segment of the URL ('web' is the public alias for 'system').
            date: 4, 6 or 8 digit date (YYYY, YYYYMM or YYYYMMDD) used to narrow fuzzy lookups.
            domain: domain segment of the URL (may also hold a full URL).
            snapshot_id: full snapshot id or an id prefix; when given, no fuzzy lookup is done.
            path: file path inside the snapshot directory; empty means index.html.
            url: full original URL, when the URL-style route matched.

        Returns:
            A redirect (login, username alias, or canonical snapshot path), a
            404 response when no snapshot matches, the live-rendered index, or
            a static file response.
        """
        if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')

        if username == 'system':
            # keep the query string, like every other redirect in this view
            return redirect(self._with_query(request, request.path.replace('/system/', '/web/', 1)))

        requested_url = url
        if not requested_url and domain and domain.startswith(('http://', 'https://')):
            requested_url = domain

        snapshot = None
        if snapshot_id:
            snapshot = self._find_snapshot_by_id(snapshot_id)
        else:
            # fuzzy lookup by date + domain/url (most recent)
            username_lookup = 'system' if username == 'web' else username
            if requested_url:
                qs = SnapshotView.find_snapshots_for_url(requested_url).filter(crawl__created_by__username=username_lookup)
            else:
                qs = Snapshot.objects.filter(crawl__created_by__username=username_lookup)

            try:
                if len(date) == 4:
                    qs = qs.filter(created_at__year=int(date))
                elif len(date) == 6:
                    qs = qs.filter(created_at__year=int(date[:4]), created_at__month=int(date[4:6]))
                elif len(date) == 8:
                    qs = qs.filter(
                        created_at__year=int(date[:4]),
                        created_at__month=int(date[4:6]),
                        created_at__day=int(date[6:8]),
                    )
            except ValueError:
                # non-numeric date: skip the date filter instead of failing the request
                pass

            if requested_url:
                snapshot = qs.order_by('-created_at', '-bookmarked_at', '-timestamp').first()
            else:
                # A domain carrying an http(s):// scheme has already been promoted
                # to requested_url above, so here it is always a bare host.
                requested_domain = Snapshot.extract_domain_from_url(f'https://{domain or ""}')

                # Prefer exact domain matches; stop scanning at the first hit
                newest_first = qs.order_by('-created_at', '-bookmarked_at')
                snapshot = next(
                    (s for s in newest_first if Snapshot.extract_domain_from_url(s.url) == requested_domain),
                    None,
                )
                if snapshot is None:
                    snapshot = qs.order_by('-created_at', '-bookmarked_at', '-timestamp').first()

        if not snapshot:
            return HttpResponse(
                format_html(
                    (
                        '<center><br/><br/><br/>'
                        'No Snapshots match the given id or url: <code>{}</code><br/><br/><br/>'
                        'Return to the <a href="/" target="_top">Main Index</a>'
                        '</center>'
                    ),
                    snapshot_id or requested_url or domain,
                ),
                content_type="text/html",
                status=404,
            )

        # Redirect anything that is not already the snapshot's canonical path
        canonical_base = snapshot.url_path
        requested_base = f'{username}/{date}/{domain or url or ""}'
        if snapshot_id:
            requested_base = f'{requested_base}/{snapshot_id}'
        if canonical_base != requested_base:
            target = f'/{canonical_base}/{path or "index.html"}'
            return redirect(self._with_query(request, target))

        archivefile = path or "index.html"

        if archivefile == "index.html":
            return SnapshotView.render_live_index(request, snapshot)

        return serve_static_with_byterange_support(
            request, archivefile, document_root=snapshot.output_dir, show_indexes=True,
        )
|
||||
|
||||
|
||||
class PublicIndexView(ListView):
|
||||
template_name = 'public_index.html'
|
||||
@@ -592,7 +659,7 @@ def live_progress_view(request):
|
||||
'snapshot_id': str(ar.snapshot_id),
|
||||
'snapshot_url': ar.snapshot.url[:60] if ar.snapshot else '',
|
||||
'embed_path': embed,
|
||||
'archive_path': f'/archive/{ar.snapshot.timestamp}/{embed}' if ar.snapshot else '',
|
||||
'archive_path': f'/{ar.snapshot.archive_path}/{embed}' if ar.snapshot else '',
|
||||
'end_ts': ar.end_ts.isoformat() if ar.end_ts else None,
|
||||
})
|
||||
|
||||
|
||||
@@ -71,8 +71,8 @@ def render_snapshots_list(snapshots_qs, limit=20):
|
||||
color: {color}; background: {bg};">{status}</span>
|
||||
</td>
|
||||
<td style="padding: 6px 8px; white-space: nowrap;">
|
||||
<a href="/archive/{snapshot.timestamp}/" style="text-decoration: none;">
|
||||
<img src="/archive/{snapshot.timestamp}/favicon.ico"
|
||||
<a href="/{snapshot.archive_path}/" style="text-decoration: none;">
|
||||
<img src="/{snapshot.archive_path}/favicon.ico"
|
||||
style="width: 16px; height: 16px; vertical-align: middle; margin-right: 4px;"
|
||||
onerror="this.style.display='none'"/>
|
||||
</a>
|
||||
|
||||
@@ -940,9 +940,8 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
|
||||
# archivebox/plugins/<plugin_name>/
|
||||
# templates/
|
||||
# icon.html # Icon for admin table view (small inline HTML)
|
||||
# thumbnail.html # Preview thumbnail for snapshot cards
|
||||
# embed.html # Iframe embed content for main preview
|
||||
# fullscreen.html # Fullscreen view template
|
||||
# card.html # Preview card for snapshot header
|
||||
# full.html # Fullscreen view template
|
||||
#
|
||||
# Template context variables available:
|
||||
# {{ result }} - ArchiveResult object
|
||||
@@ -953,21 +952,22 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
|
||||
|
||||
# Default templates used when plugin doesn't provide one
|
||||
DEFAULT_TEMPLATES = {
|
||||
'icon': '''<span title="{{ plugin }}">{{ icon }}</span>''',
|
||||
'thumbnail': '''
|
||||
<img src="{{ output_path }}"
|
||||
alt="{{ plugin }} output"
|
||||
style="max-width: 100%; max-height: 100px; object-fit: cover;"
|
||||
onerror="this.style.display='none'">
|
||||
'icon': '''
|
||||
<span title="{{ plugin }}" style="display:inline-flex; width:20px; height:20px; align-items:center; justify-content:center;">
|
||||
{{ icon }}
|
||||
</span>
|
||||
''',
|
||||
'embed': '''
|
||||
'card': '''
|
||||
<iframe src="{{ output_path }}"
|
||||
class="card-img-top"
|
||||
style="width: 100%; height: 100%; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts">
|
||||
sandbox="allow-same-origin allow-scripts allow-forms"
|
||||
loading="lazy">
|
||||
</iframe>
|
||||
''',
|
||||
'fullscreen': '''
|
||||
'full': '''
|
||||
<iframe src="{{ output_path }}"
|
||||
class="full-page-iframe"
|
||||
style="width: 100%; height: 100vh; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||
</iframe>
|
||||
@@ -981,7 +981,7 @@ def get_plugin_template(plugin: str, template_name: str, fallback: bool = True)
|
||||
|
||||
Args:
|
||||
plugin: Plugin name (e.g., 'screenshot', '15_singlefile')
|
||||
template_name: One of 'icon', 'thumbnail', 'embed', 'fullscreen'
|
||||
template_name: One of 'icon', 'card', 'full'
|
||||
fallback: If True, return default template if plugin template not found
|
||||
|
||||
Returns:
|
||||
@@ -1050,7 +1050,7 @@ def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
|
||||
|
||||
Returns:
|
||||
Dict mapping plugin names to dicts of template_name -> template_path.
|
||||
e.g., {'screenshot': {'icon': '/path/to/icon.html', 'thumbnail': '/path/to/thumbnail.html'}}
|
||||
e.g., {'screenshot': {'icon': '/path/to/icon.html', 'card': '/path/to/card.html'}}
|
||||
"""
|
||||
templates: Dict[str, Dict[str, str]] = {}
|
||||
|
||||
@@ -1068,7 +1068,7 @@ def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
|
||||
|
||||
plugin_templates = {}
|
||||
for template_file in templates_dir.glob('*.html'):
|
||||
template_name = template_file.stem # icon, thumbnail, embed, fullscreen
|
||||
template_name = template_file.stem # icon, card, full
|
||||
plugin_templates[template_name] = str(template_file)
|
||||
|
||||
if plugin_templates:
|
||||
|
||||
@@ -237,8 +237,8 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
sha256 = models.CharField(max_length=64, default='', null=False, blank=True)
|
||||
|
||||
# State machine fields
|
||||
status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.QUEUED, db_index=True)
|
||||
retry_at = models.DateTimeField(default=timezone.now, null=True, blank=True, db_index=True,
|
||||
status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED, max_length=16)
|
||||
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now,
|
||||
help_text="When to retry this binary installation")
|
||||
|
||||
# Health stats
|
||||
@@ -246,6 +246,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
num_uses_succeeded = models.PositiveIntegerField(default=0)
|
||||
|
||||
state_machine_name: str = 'archivebox.machine.models.BinaryMachine'
|
||||
active_state: str = StatusChoices.QUEUED
|
||||
|
||||
objects: BinaryManager = BinaryManager()
|
||||
|
||||
|
||||
@@ -49,6 +49,111 @@ const OUTPUT_DIR = '.';
|
||||
let chromePid = null;
|
||||
let browserInstance = null;
|
||||
|
||||
/**
 * Parse the contents of a Netscape-format cookies.txt file.
 *
 * Each data line holds 7 tab-separated fields:
 *   domain, includeSubdomains, path, secure, expiry, name, value
 * Lines starting with `#HttpOnly_` are HttpOnly cookies; any other line
 * starting with `#` is a comment. Blank lines are ignored.
 *
 * @param {string} contents - raw text of the cookies file
 * @returns {{cookies: Array<Object>, skipped: number}} cookie objects with
 *   name/value/domain/path/secure/httpOnly (plus `expires` when the expiry is
 *   a positive integer), and the number of malformed lines that were skipped
 */
function parseCookiesTxt(contents) {
    const cookies = [];
    let skipped = 0;

    for (const rawLine of contents.split(/\r?\n/)) {
        // Strip leading whitespace and trailing spaces/CRs, but NOT trailing
        // tabs: a cookie with an empty value ends in a tab, and trimming it
        // would leave only 6 fields and wrongly drop the cookie as malformed.
        const line = rawLine.replace(/^\s+|[ \r]+$/g, '');
        if (!line) continue;

        let httpOnly = false;
        let dataLine = line;

        if (dataLine.startsWith('#HttpOnly_')) {
            httpOnly = true;
            dataLine = dataLine.slice('#HttpOnly_'.length);
        } else if (dataLine.startsWith('#')) {
            continue;
        }

        const parts = dataLine.split('\t');
        if (parts.length < 7) {
            skipped += 1;
            continue;
        }

        const [domainRaw, includeSubdomainsRaw, pathRaw, secureRaw, expiryRaw, name, value] = parts;
        if (!name || !domainRaw) {
            skipped += 1;
            continue;
        }

        // Normalize the leading dot so it agrees with the includeSubdomains flag
        const includeSubdomains = (includeSubdomainsRaw || '').toUpperCase() === 'TRUE';
        let domain = domainRaw;
        if (includeSubdomains && !domain.startsWith('.')) domain = `.${domain}`;
        if (!includeSubdomains && domain.startsWith('.')) domain = domain.slice(1);

        const cookie = {
            name,
            value,
            domain,
            path: pathRaw || '/',
            secure: (secureRaw || '').toUpperCase() === 'TRUE',
            httpOnly,
        };

        // Missing, zero or unparseable expiry: leave `expires` unset
        const expires = parseInt(expiryRaw, 10);
        if (!isNaN(expires) && expires > 0) {
            cookie.expires = expires;
        }

        cookies.push(cookie);
    }

    return { cookies, skipped };
}
|
||||
|
||||
/**
 * Import cookies from a Netscape-format cookies.txt file into a running browser.
 *
 * Does nothing when no file is configured. A missing, unreadable or empty file
 * is reported on stderr and skipped. Cookies are sent over a CDP session in
 * chunks; a chunk that fails is logged and the remaining chunks are still tried.
 *
 * @param {Object} browser - browser instance exposing `newPage()`
 * @param {string} cookiesFile - path to the cookies.txt file (falsy to skip)
 * @param {string} userDataDir - Chrome user data dir; only used to warn when unset
 * @returns {Promise<void>}
 */
async function importCookiesFromFile(browser, cookiesFile, userDataDir) {
    if (!cookiesFile) return;

    if (!fs.existsSync(cookiesFile)) {
        console.error(`[!] Cookies file not found: ${cookiesFile}`);
        return;
    }

    let contents = '';
    try {
        contents = fs.readFileSync(cookiesFile, 'utf-8');
    } catch (e) {
        console.error(`[!] Failed to read COOKIES_TXT_FILE: ${e.message}`);
        return;
    }

    const { cookies, skipped } = parseCookiesTxt(contents);
    if (cookies.length === 0) {
        console.error('[!] No cookies found to import');
        return;
    }

    console.error(`[*] Importing ${cookies.length} cookies from ${cookiesFile}...`);
    if (skipped) {
        console.error(`[*] Skipped ${skipped} malformed cookie line(s)`);
    }
    if (!userDataDir) {
        console.error('[!] CHROME_USER_DATA_DIR not set; cookies will not persist beyond this session');
    }

    // A throwaway page is opened only to obtain a CDP session for Network.setCookies
    const page = await browser.newPage();
    const chunkSize = 200;
    let imported = 0;

    try {
        const client = await page.target().createCDPSession();
        await client.send('Network.enable');

        for (let i = 0; i < cookies.length; i += chunkSize) {
            const chunk = cookies.slice(i, i + chunkSize);
            try {
                await client.send('Network.setCookies', { cookies: chunk });
                imported += chunk.length;
            } catch (e) {
                // best effort: report the failed chunk and keep going
                console.error(`[!] Failed to import cookies ${i + 1}-${i + chunk.length}: ${e.message}`);
            }
        }
    } finally {
        // Close the helper page even when session setup throws, so it never
        // lingers as an extra tab in the launched browser.
        await page.close();
    }

    console.error(`[+] Imported ${imported}/${cookies.length} cookies`);
}
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
const args = {};
|
||||
@@ -118,10 +223,14 @@ async function main() {
|
||||
// Load installed extensions
|
||||
const extensionsDir = getExtensionsDir();
|
||||
const userDataDir = getEnv('CHROME_USER_DATA_DIR');
|
||||
const cookiesFile = getEnv('COOKIES_TXT_FILE') || getEnv('COOKIES_FILE');
|
||||
|
||||
if (userDataDir) {
|
||||
console.error(`[*] Using user data dir: ${userDataDir}`);
|
||||
}
|
||||
if (cookiesFile) {
|
||||
console.error(`[*] Using cookies file: ${cookiesFile}`);
|
||||
}
|
||||
|
||||
const installedExtensions = [];
|
||||
const extensionPaths = [];
|
||||
@@ -179,6 +288,9 @@ async function main() {
|
||||
});
|
||||
browserInstance = browser;
|
||||
|
||||
// Import cookies into Chrome profile at crawl start
|
||||
await importCookiesFromFile(browser, cookiesFile, userDataDir);
|
||||
|
||||
// Get actual extension IDs from chrome://extensions page
|
||||
if (extensionPaths.length > 0) {
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
|
||||
@@ -38,6 +38,82 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
CHROME_NAVIGATE_HOOK,
|
||||
)
|
||||
|
||||
def _get_cookies_via_cdp(port: int, env: dict) -> list[dict]:
    """
    Read all cookies from the browser whose debugging endpoint is on ``port``.

    Spawns a short Node.js script that lists the targets at
    ``http://127.0.0.1:<port>/json/list``, opens a websocket to the first
    ``page`` target (or the first target of any type), sends
    ``Network.getAllCookies`` and prints the resulting cookies as JSON.

    Args:
        port: debugging port, handed to the script through ``CDP_PORT``.
        env: base environment for the Node process.

    Returns:
        The decoded cookie list; an empty list when the script prints nothing.

    Raises:
        AssertionError: when the Node script exits with a non-zero status.
    """
    node_script = r"""
const http = require('http');
const WebSocket = require('ws');
const port = process.env.CDP_PORT;

function getTargets() {
  return new Promise((resolve, reject) => {
    const req = http.get(`http://127.0.0.1:${port}/json/list`, (res) => {
      let data = '';
      res.on('data', (chunk) => (data += chunk));
      res.on('end', () => {
        try {
          resolve(JSON.parse(data));
        } catch (e) {
          reject(e);
        }
      });
    });
    req.on('error', reject);
  });
}

(async () => {
  const targets = await getTargets();
  const pageTarget = targets.find(t => t.type === 'page') || targets[0];
  if (!pageTarget) {
    console.error('No page target found');
    process.exit(2);
  }

  const ws = new WebSocket(pageTarget.webSocketDebuggerUrl);
  const timer = setTimeout(() => {
    console.error('Timeout waiting for cookies');
    process.exit(3);
  }, 10000);

  ws.on('open', () => {
    ws.send(JSON.stringify({ id: 1, method: 'Network.getAllCookies' }));
  });

  ws.on('message', (data) => {
    const msg = JSON.parse(data);
    if (msg.id === 1) {
      clearTimeout(timer);
      ws.close();
      if (!msg.result || !msg.result.cookies) {
        console.error('No cookies in response');
        process.exit(4);
      }
      process.stdout.write(JSON.stringify(msg.result.cookies));
      process.exit(0);
    }
  });

  ws.on('error', (err) => {
    console.error(String(err));
    process.exit(5);
  });
})().catch((err) => {
  console.error(String(err));
  process.exit(1);
});
"""

    # the script learns which port to talk to from the environment
    child_env = env | {'CDP_PORT': str(port)}
    proc = subprocess.run(
        ['node', '-e', node_script],
        env=child_env,
        timeout=30,
        text=True,
        capture_output=True,
    )
    assert proc.returncode == 0, f"Failed to read cookies via CDP: {proc.stderr}\nStdout: {proc.stdout}"

    payload = proc.stdout if proc.stdout else '[]'
    return json.loads(payload)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def ensure_chromium_and_puppeteer_installed(tmp_path_factory):
|
||||
"""Ensure Chromium and puppeteer are installed before running tests."""
|
||||
@@ -197,6 +273,77 @@ def test_chrome_launch_and_tab_creation():
|
||||
pass
|
||||
|
||||
|
||||
def test_cookies_imported_on_launch():
    """Integration test: COOKIES_TXT_FILE is imported at crawl start.

    Writes a Netscape-format cookies.txt containing one cookie, starts the
    Chrome launch hook with COOKIES_TXT_FILE pointing at it, and polls the
    browser over CDP until that cookie shows up in the session.

    Cleanup runs in a ``finally`` block so the launcher and the Chrome
    process are torn down even when an assertion fails.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()

        cookies_file = Path(tmpdir) / 'cookies.txt'
        cookies_file.write_text(
            '\n'.join([
                '# Netscape HTTP Cookie File',
                '# https://curl.se/docs/http-cookies.html',
                '# This file was generated by a test',
                '',
                'example.com\tTRUE\t/\tFALSE\t2147483647\tabx_test_cookie\thello',
                '',
            ])
        )

        profile_dir = Path(tmpdir) / 'profile'
        env = get_test_env()
        env.update({
            'CHROME_HEADLESS': 'true',
            'CHROME_USER_DATA_DIR': str(profile_dir),
            'COOKIES_TXT_FILE': str(cookies_file),
        })

        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-cookies'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env
        )

        # Stays None until chrome.pid has been read, so cleanup knows whether
        # there is a browser process to kill.
        chrome_pid = None
        try:
            # Wait up to ~15s for the hook to publish the debugging port.
            for _ in range(15):
                if (chrome_dir / 'port.txt').exists():
                    break
                time.sleep(1)

            assert (chrome_dir / 'port.txt').exists(), "port.txt should exist"
            chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
            port = int((chrome_dir / 'port.txt').read_text().strip())

            # The import may finish slightly after port.txt appears, so poll.
            cookie_found = False
            for _ in range(15):
                cookies = _get_cookies_via_cdp(port, env)
                cookie_found = any(
                    c.get('name') == 'abx_test_cookie' and c.get('value') == 'hello'
                    for c in cookies
                )
                if cookie_found:
                    break
                time.sleep(1)

            assert cookie_found, "Imported cookie should be present in Chrome session"
        finally:
            # Cleanup (best effort): stop the launcher, then force-kill Chrome.
            try:
                chrome_launch_process.send_signal(signal.SIGTERM)
                chrome_launch_process.wait(timeout=5)
            except (OSError, subprocess.TimeoutExpired):
                pass
            if chrome_pid is not None:
                try:
                    os.kill(chrome_pid, signal.SIGKILL)
                except OSError:
                    pass
|
||||
|
||||
|
||||
def test_chrome_navigation():
|
||||
"""Integration test: Navigate to a URL."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
<!-- Embedded forum view - renders JSONL forum posts -->
|
||||
<div class="extractor-embed forumdl-embed" style="width: 100%; max-width: 900px; margin: 0 auto; background: #1a1a1a; padding: 20px; border-radius: 8px;">
|
||||
<div style="text-align: center; padding: 15px 0; border-bottom: 1px solid #333; margin-bottom: 20px;">
|
||||
<span style="font-size: 32px;">💬</span>
|
||||
<h3 style="margin: 10px 0; color: #fff; font-size: 18px;">Forum Thread</h3>
|
||||
</div>
|
||||
<div id="forum-posts" style="max-height: 500px; overflow-y: auto; color: #ddd;"></div>
|
||||
<script>
|
||||
(async function() {
|
||||
try {
|
||||
const response = await fetch('{{ output_path }}');
|
||||
const text = await response.text();
|
||||
const posts = text.trim().split('\n').map(line => JSON.parse(line));
|
||||
const container = document.getElementById('forum-posts');
|
||||
|
||||
posts.forEach(post => {
|
||||
const postDiv = document.createElement('div');
|
||||
postDiv.style.cssText = 'background: #2a2a2a; padding: 15px; margin-bottom: 15px; border-radius: 5px; border-left: 3px solid #4a9eff;';
|
||||
|
||||
const author = post.author || 'Anonymous';
|
||||
const date = post.date ? new Date(post.date).toLocaleString() : '';
|
||||
const title = post.title || '';
|
||||
const content = post.content || post.body || '';
|
||||
|
||||
postDiv.innerHTML = `
|
||||
<div style="display: flex; justify-content: space-between; margin-bottom: 10px; padding-bottom: 8px; border-bottom: 1px solid #444;">
|
||||
<strong style="color: #4a9eff;">${author}</strong>
|
||||
<span style="color: #888; font-size: 12px;">${date}</span>
|
||||
</div>
|
||||
${title ? `<h4 style="margin: 0 0 10px 0; color: #fff;">${title}</h4>` : ''}
|
||||
<div style="color: #ccc; line-height: 1.5;">${content}</div>
|
||||
`;
|
||||
container.appendChild(postDiv);
|
||||
});
|
||||
} catch(e) {
|
||||
document.getElementById('forum-posts').innerHTML = '<p style="color: #888;">Error loading forum posts</p>';
|
||||
}
|
||||
})();
|
||||
</script>
|
||||
</div>
|
||||
@@ -1,11 +0,0 @@
|
||||
<!-- Embedded gallery view - shows first image with link to full gallery -->
|
||||
<div class="extractor-embed gallerydl-embed" style="width: 100%; max-width: 800px; margin: 0 auto; background: #1a1a1a; padding: 20px;">
|
||||
<img src="{{ output_path }}"
|
||||
style="width: 100%; max-height: 600px; object-fit: contain;"
|
||||
alt="Gallery image"
|
||||
onerror="this.style.display='none'; this.nextElementSibling.style.display='flex';">
|
||||
<div style="display: none; flex-direction: column; align-items: center; color: #888; padding: 40px;">
|
||||
<span style="font-size: 64px;">🖼️</span>
|
||||
<span style="margin-top: 10px;">Gallery downloaded</span>
|
||||
</div>
|
||||
</div>
|
||||
@@ -1,15 +0,0 @@
|
||||
<!-- Embedded paper view - shows PDF viewer -->
|
||||
<div class="extractor-embed papersdl-embed" style="width: 100%; max-width: 900px; margin: 0 auto; background: #1a1a1a; padding: 20px; border-radius: 8px;">
|
||||
<div style="text-align: center; padding: 15px 0; border-bottom: 1px solid #333; margin-bottom: 20px;">
|
||||
<span style="font-size: 32px;">📄</span>
|
||||
<h3 style="margin: 10px 0; color: #fff; font-size: 18px;">Scientific Paper</h3>
|
||||
</div>
|
||||
<div style="width: 100%; height: 500px; background: #2a2a2a; border-radius: 5px; overflow: hidden;">
|
||||
<embed src="{{ output_path }}" type="application/pdf" width="100%" height="100%" />
|
||||
</div>
|
||||
<div style="margin-top: 15px; text-align: center;">
|
||||
<a href="{{ output_path }}" download style="color: #4a9eff; text-decoration: none; padding: 10px 20px; background: #2a2a2a; border-radius: 5px; display: inline-block;">
|
||||
Download PDF
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
@@ -271,12 +271,11 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
|
||||
records.append(record)
|
||||
print(json.dumps(record))
|
||||
|
||||
if records:
|
||||
URLS_FILE.write_text('\n'.join(json.dumps(r) for r in records) + '\n')
|
||||
URLS_FILE.write_text('\n'.join(json.dumps(r) for r in records) + ('\n' if records else ''))
|
||||
|
||||
# Emit ArchiveResult record to mark completion
|
||||
status = 'succeeded' if urls_found else 'skipped'
|
||||
output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'
|
||||
output_str = URLS_FILE.name
|
||||
ar_record = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': status,
|
||||
|
||||
@@ -57,7 +57,7 @@ class TestParseHtmlUrls:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'Found 3 URLs' in result.stderr
|
||||
assert 'urls.jsonl' in result.stderr
|
||||
|
||||
# Parse Snapshot records from stdout
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
|
||||
@@ -78,6 +78,11 @@ class TestParseHtmlUrls:
|
||||
assert '"type": "ArchiveResult"' in result.stdout
|
||||
assert '"status": "succeeded"' in result.stdout
|
||||
|
||||
urls_file = tmp_path / 'urls.jsonl'
|
||||
assert urls_file.exists(), "urls.jsonl not created"
|
||||
file_lines = [line for line in urls_file.read_text().splitlines() if line.strip()]
|
||||
assert len(file_lines) == 3, f"Expected 3 urls.jsonl entries, got {len(file_lines)}"
|
||||
|
||||
def test_ignores_non_http_schemes(self, tmp_path):
|
||||
"""Test that non-http schemes are ignored."""
|
||||
input_file = tmp_path / 'page.html'
|
||||
@@ -194,7 +199,7 @@ class TestParseHtmlUrls:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'No URLs found' in result.stderr
|
||||
assert 'urls.jsonl' in result.stderr
|
||||
assert '"status": "skipped"' in result.stdout
|
||||
|
||||
def test_handles_malformed_html(self, tmp_path):
|
||||
|
||||
@@ -18,6 +18,7 @@ Supports various field names for URL, title, timestamp, and tags.
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from html import unescape
|
||||
from urllib.parse import urlparse
|
||||
@@ -25,6 +26,7 @@ from urllib.parse import urlparse
|
||||
import rich_click as click
|
||||
|
||||
PLUGIN_NAME = 'parse_jsonl_urls'
|
||||
URLS_FILE = Path('urls.jsonl')
|
||||
|
||||
|
||||
def parse_bookmarked_at(link: dict) -> str | None:
|
||||
@@ -188,9 +190,12 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
|
||||
for entry in urls_found:
|
||||
print(json.dumps(entry))
|
||||
|
||||
# Write urls.jsonl to disk for crawl system
|
||||
URLS_FILE.write_text('\n'.join(json.dumps(r) for r in urls_found) + ('\n' if urls_found else ''))
|
||||
|
||||
# Emit ArchiveResult record to mark completion
|
||||
status = 'succeeded' if urls_found else 'skipped'
|
||||
output_str = f'Found {len(urls_found)} URLs, {len(all_tags)} tags' if urls_found else 'No URLs found'
|
||||
output_str = URLS_FILE.name
|
||||
ar_record = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': status,
|
||||
|
||||
@@ -32,7 +32,7 @@ class TestParseJsonlUrls:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'Found 3 URLs' in result.stdout
|
||||
assert 'urls.jsonl' in result.stderr or 'urls.jsonl' in result.stdout
|
||||
|
||||
# Output goes to stdout (JSONL)
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
|
||||
@@ -195,7 +195,7 @@ class TestParseJsonlUrls:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'No URLs found' in result.stderr
|
||||
assert 'urls.jsonl' in result.stderr
|
||||
assert '"status": "skipped"' in result.stdout
|
||||
|
||||
def test_exits_1_when_file_not_found(self, tmp_path):
|
||||
|
||||
@@ -16,6 +16,7 @@ import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from html import unescape
|
||||
from urllib.parse import urlparse
|
||||
@@ -23,6 +24,7 @@ from urllib.parse import urlparse
|
||||
import rich_click as click
|
||||
|
||||
PLUGIN_NAME = 'parse_netscape_urls'
|
||||
URLS_FILE = Path('urls.jsonl')
|
||||
|
||||
# Constants for timestamp epoch detection
|
||||
UNIX_EPOCH = 0 # 1970-01-01 00:00:00 UTC
|
||||
@@ -232,9 +234,12 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
|
||||
for entry in urls_found:
|
||||
print(json.dumps(entry))
|
||||
|
||||
# Write urls.jsonl to disk for crawl system
|
||||
URLS_FILE.write_text('\n'.join(json.dumps(r) for r in urls_found) + ('\n' if urls_found else ''))
|
||||
|
||||
# Emit ArchiveResult record to mark completion
|
||||
status = 'succeeded' if urls_found else 'skipped'
|
||||
output_str = f'Found {len(urls_found)} URLs, {len(all_tags)} tags' if urls_found else 'No bookmarks found'
|
||||
output_str = URLS_FILE.name
|
||||
ar_record = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': status,
|
||||
|
||||
@@ -37,7 +37,7 @@ class TestParseNetscapeUrls:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'Found 3 URLs' in result.stdout
|
||||
assert 'urls.jsonl' in result.stderr or 'urls.jsonl' in result.stdout
|
||||
|
||||
# Output goes to stdout (JSONL)
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
|
||||
@@ -135,7 +135,7 @@ class TestParseNetscapeUrls:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'No bookmarks found' in result.stderr
|
||||
assert 'urls.jsonl' in result.stderr
|
||||
assert '"status": "skipped"' in result.stdout
|
||||
|
||||
def test_exits_1_when_file_not_found(self, tmp_path):
|
||||
|
||||
@@ -935,7 +935,7 @@ class TestEdgeCases:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'Found 1000 URLs' in result.stdout
|
||||
assert 'urls.jsonl' in result.stderr or 'urls.jsonl' in result.stdout
|
||||
|
||||
# Output goes to stdout (JSONL) - get all JSONL records
|
||||
all_lines = [line for line in result.stdout.strip().split('\n') if line.strip() and line.startswith('{')]
|
||||
|
||||
@@ -16,6 +16,7 @@ Examples:
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from html import unescape
|
||||
from time import mktime
|
||||
@@ -24,6 +25,7 @@ from urllib.parse import urlparse
|
||||
import rich_click as click
|
||||
|
||||
PLUGIN_NAME = 'parse_rss_urls'
|
||||
URLS_FILE = Path('urls.jsonl')
|
||||
|
||||
try:
|
||||
import feedparser
|
||||
@@ -140,9 +142,12 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
|
||||
for entry in urls_found:
|
||||
print(json.dumps(entry))
|
||||
|
||||
# Write urls.jsonl to disk for crawl system
|
||||
URLS_FILE.write_text('\n'.join(json.dumps(r) for r in urls_found) + ('\n' if urls_found else ''))
|
||||
|
||||
# Emit ArchiveResult record to mark completion
|
||||
status = 'succeeded' if urls_found else 'skipped'
|
||||
output_str = f'Found {len(urls_found)} URLs, {len(all_tags)} tags' if urls_found else 'No URLs found'
|
||||
output_str = URLS_FILE.name
|
||||
ar_record = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': status,
|
||||
|
||||
@@ -66,7 +66,7 @@ class TestParseRssUrls:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'Found 2 URLs' in result.stdout
|
||||
assert 'urls.jsonl' in result.stderr or 'urls.jsonl' in result.stdout
|
||||
|
||||
# Output goes to stdout (JSONL)
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
|
||||
@@ -134,7 +134,7 @@ class TestParseRssUrls:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'No URLs found' in result.stderr
|
||||
assert 'urls.jsonl' in result.stderr
|
||||
assert '"status": "skipped"' in result.stdout
|
||||
|
||||
def test_exits_1_when_file_not_found(self, tmp_path):
|
||||
|
||||
@@ -882,7 +882,7 @@ class TestEdgeCases:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'Found 100 URLs' in result.stdout
|
||||
assert 'urls.jsonl' in result.stderr or 'urls.jsonl' in result.stdout
|
||||
|
||||
# Output goes to stdout (JSONL)
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
|
||||
|
||||
@@ -26,6 +26,7 @@ from urllib.request import urlopen
|
||||
import rich_click as click
|
||||
|
||||
PLUGIN_NAME = 'parse_txt_urls'
|
||||
URLS_FILE = Path('urls.jsonl')
|
||||
|
||||
# URL regex from archivebox/misc/util.py
|
||||
# https://mathiasbynens.be/demo/url-regex
|
||||
@@ -127,6 +128,7 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
|
||||
urls_found.add(cleaned_url)
|
||||
|
||||
# Emit Snapshot records to stdout (JSONL)
|
||||
records = []
|
||||
for found_url in sorted(urls_found):
|
||||
record = {
|
||||
'type': 'Snapshot',
|
||||
@@ -138,11 +140,13 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
|
||||
record['parent_snapshot_id'] = snapshot_id
|
||||
if crawl_id:
|
||||
record['crawl_id'] = crawl_id
|
||||
records.append(record)
|
||||
print(json.dumps(record))
|
||||
|
||||
# Emit ArchiveResult record to mark completion
|
||||
URLS_FILE.write_text('\n'.join(json.dumps(r) for r in records) + ('\n' if records else ''))
|
||||
status = 'succeeded' if urls_found else 'skipped'
|
||||
output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'
|
||||
output_str = URLS_FILE.name
|
||||
ar_record = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': status,
|
||||
|
||||
@@ -32,7 +32,7 @@ https://www.iana.org/domains/reserved
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Failed: {result.stderr}"
|
||||
assert 'Found 3 URLs' in result.stderr
|
||||
assert 'urls.jsonl' in result.stderr
|
||||
|
||||
# Parse Snapshot records from stdout
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
|
||||
@@ -113,7 +113,7 @@ Also see https://github.com/user/repo for the code.
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'No URLs found' in result.stderr
|
||||
assert 'urls.jsonl' in result.stderr
|
||||
assert '"status": "skipped"' in result.stdout
|
||||
|
||||
def test_exits_1_when_file_not_found(self, tmp_path):
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
<!-- PDF embed - full PDF viewer -->
|
||||
<embed src="{{ output_path }}#toolbar=1&navpanes=1"
|
||||
type="application/pdf"
|
||||
class="extractor-embed pdf-embed"
|
||||
style="width: 100%; height: 100%; min-height: 500px;">
|
||||
@@ -31,6 +31,7 @@ PLUGIN_NAME = 'readability'
|
||||
BIN_NAME = 'readability-extractor'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'content.html'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
@@ -130,11 +131,11 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
if not text_content and not html_content:
|
||||
return False, None, 'No content extracted'
|
||||
|
||||
(output_dir / 'content.html').write_text(html_content, encoding='utf-8')
|
||||
(output_dir / OUTPUT_FILE).write_text(html_content, encoding='utf-8')
|
||||
(output_dir / 'content.txt').write_text(text_content, encoding='utf-8')
|
||||
(output_dir / 'article.json').write_text(json.dumps(result_json, indent=2), encoding='utf-8')
|
||||
|
||||
return True, OUTPUT_DIR, ''
|
||||
return True, OUTPUT_FILE, ''
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
return False, None, f'Timed out after {timeout} seconds'
|
||||
|
||||
6
archivebox/plugins/readability/templates/full.html
Normal file
6
archivebox/plugins/readability/templates/full.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Readability fullscreen - show extracted article HTML -->
|
||||
<iframe class="full-page-iframe"
|
||||
src="{{ output_path }}"
|
||||
name="preview"
|
||||
sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
|
||||
</iframe>
|
||||
@@ -1,5 +0,0 @@
|
||||
<!-- Screenshot embed - full image view -->
|
||||
<img src="{{ output_path }}"
|
||||
alt="Screenshot of page"
|
||||
class="extractor-embed screenshot-embed"
|
||||
style="max-width: 100%; height: auto;">
|
||||
@@ -263,7 +263,7 @@ async function main() {
|
||||
const archiveResult = {
|
||||
type: 'ArchiveResult',
|
||||
status,
|
||||
output_str: extractedTitle || error || '',
|
||||
output_str: output || error || '',
|
||||
};
|
||||
console.log(JSON.stringify(archiveResult));
|
||||
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
<!-- YT-DLP embed - video/audio player -->
|
||||
<div class="extractor-embed ytdlp-embed" style="width: 100%; height: 100%; min-height: 400px; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
|
||||
<video src="{{ output_path }}"
|
||||
style="max-width: 100%; max-height: 100%;"
|
||||
controls
|
||||
preload="metadata">
|
||||
Your browser does not support the video tag.
|
||||
</video>
|
||||
</div>
|
||||
@@ -6,12 +6,12 @@
|
||||
</td>
|
||||
<td class="title-col" style="opacity: {% if link.title %}1{% else %}0.3{% endif %}" title="{{link.title|default:'Not yet archived...'}}">
|
||||
{% if link.is_archived %}
|
||||
<a href="/archive/{{link.timestamp}}/index.html"><img src="/archive/{{link.timestamp}}/favicon.ico" onerror="this.style.display='none'" class="link-favicon" decoding="async"></a>
|
||||
<a href="/{{link.archive_path}}/index.html"><img src="/{{link.archive_path}}/favicon.ico" onerror="this.style.display='none'" class="link-favicon" decoding="async"></a>
|
||||
{% else %}
|
||||
<a href="/archive/{{link.timestamp}}/index.html"><img src="{% static 'spinner.gif' %}" onerror="this.style.display='none'" class="link-favicon" decoding="async" style="height: 15px"></a>
|
||||
<a href="/{{link.archive_path}}/index.html"><img src="{% static 'spinner.gif' %}" onerror="this.style.display='none'" class="link-favicon" decoding="async" style="height: 15px"></a>
|
||||
{% endif %}
|
||||
|
||||
<a href="/archive/{{link.timestamp}}/index.html" title="{{link.title|default:'Not yet archived...'}}">
|
||||
<a href="/{{link.archive_path}}/index.html" title="{{link.title|default:'Not yet archived...'}}">
|
||||
<span data-title-for="{{link.url}}" data-archived="{{link.is_archived}}">
|
||||
{{link.title|default:'Loading...'|truncatechars:128}}
|
||||
</span>
|
||||
@@ -29,7 +29,7 @@
|
||||
{% if link.icons %}
|
||||
{{link.icons}} <small style="float:right; opacity: 0.5">{{link.num_outputs}}</small>
|
||||
{% else %}
|
||||
<a href="/archive/{{link.timestamp}}/index.html">
|
||||
<a href="/{{link.archive_path}}/index.html">
|
||||
📄
|
||||
{{link.num_outputs}} <img src="{% static 'spinner.gif' %}" onerror="this.style.display='none'" class="files-spinner" decoding="async" style="height: 15px"/>
|
||||
</a>
|
||||
|
||||
@@ -113,6 +113,10 @@
|
||||
border-radius: 10px;
|
||||
background-color: black;
|
||||
overflow: hidden;
|
||||
min-height: 130px;
|
||||
}
|
||||
.header-bottom-frames .card:has([data-compact]) {
|
||||
min-height: 0;
|
||||
}
|
||||
.card h4 {
|
||||
font-size: 1.4vw;
|
||||
@@ -154,6 +158,202 @@
|
||||
transform: scale(0.25);
|
||||
transform-origin: 0 0;
|
||||
}
|
||||
.row.header-bottom-frames {
|
||||
display: block !important;
|
||||
width: 100%;
|
||||
max-width: 100%;
|
||||
column-width: 180px;
|
||||
column-gap: 8px;
|
||||
column-fill: auto;
|
||||
margin-left: 0px;
|
||||
margin-right: 0px;
|
||||
flex: none !important;
|
||||
}
|
||||
.header-bottom-frames .col-lg-2 {
|
||||
padding-left: 0px;
|
||||
padding-right: 0px;
|
||||
max-width: 100%;
|
||||
width: 100% !important;
|
||||
display: inline-block !important;
|
||||
float: none !important;
|
||||
flex: none !important;
|
||||
break-inside: avoid;
|
||||
margin-bottom: 6px;
|
||||
vertical-align: top;
|
||||
}
|
||||
.header-bottom-frames .card:has([data-compact]) .thumbnail-wrapper,
|
||||
.header-bottom-frames .card:has([data-compact]) .thumbnail-wrapper.compact {
|
||||
height: 32px;
|
||||
}
|
||||
.header-bottom-frames .card:has([data-compact]) .thumbnail-text {
|
||||
height: auto;
|
||||
max-height: 64px;
|
||||
}
|
||||
.header-bottom-frames .card:has([data-compact]) .card-body {
|
||||
padding: 4px 8px;
|
||||
max-height: 44px;
|
||||
}
|
||||
.thumbnail-wrapper {
|
||||
height: 100px;
|
||||
overflow: hidden;
|
||||
background: #333;
|
||||
}
|
||||
.thumbnail-compact {
|
||||
height: 32px;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
padding: 0 8px;
|
||||
font-size: 13px;
|
||||
line-height: 1;
|
||||
color: #bdbdbd;
|
||||
background: #111;
|
||||
border-bottom: 1px solid #222;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.02em;
|
||||
}
|
||||
.thumbnail-compact-label {
|
||||
color: #e1e1e1;
|
||||
}
|
||||
.thumbnail-compact-meta {
|
||||
color: #777;
|
||||
font-size: 11px;
|
||||
margin-left: auto;
|
||||
}
|
||||
.thumbnail-compact svg,
|
||||
.thumbnail-compact img {
|
||||
height: 12px;
|
||||
width: 12px;
|
||||
}
|
||||
.thumbnail-text {
|
||||
height: 100px;
|
||||
background: #121212;
|
||||
color: #d8d8d8;
|
||||
padding: 6px 8px;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 4px;
|
||||
font-family: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
|
||||
font-size: 11px;
|
||||
line-height: 1.2;
|
||||
overflow: hidden;
|
||||
}
|
||||
.thumbnail-text-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
font-size: 10px;
|
||||
color: #9b9b9b;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.04em;
|
||||
}
|
||||
.thumbnail-text-pre {
|
||||
margin: 0;
|
||||
white-space: pre-wrap;
|
||||
overflow: hidden;
|
||||
display: -webkit-box;
|
||||
-webkit-box-orient: vertical;
|
||||
-webkit-line-clamp: 5;
|
||||
}
|
||||
.thumbnail-text[data-plugin="title"] .thumbnail-text-pre {
|
||||
font-size: 13px;
|
||||
font-weight: 600;
|
||||
-webkit-line-clamp: 3;
|
||||
}
|
||||
.thumb-grid {
|
||||
display: block;
|
||||
column-width: 180px;
|
||||
column-gap: 6px;
|
||||
align-content: start;
|
||||
width: 100%;
|
||||
}
|
||||
.thumb-card {
|
||||
box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02);
|
||||
border: 1px solid rgba(0,0,0,3);
|
||||
border-radius: 10px;
|
||||
background-color: black;
|
||||
overflow: hidden;
|
||||
display: inline-block;
|
||||
width: 100%;
|
||||
break-inside: avoid;
|
||||
box-sizing: border-box;
|
||||
margin-bottom: 6px;
|
||||
height: 138px;
|
||||
min-height: 138px;
|
||||
max-height: 138px;
|
||||
}
|
||||
.thumb-card:has([data-compact]) {
|
||||
height: 46px;
|
||||
min-height: 46px;
|
||||
max-height: 46px;
|
||||
}
|
||||
.thumb-card .thumb-body {
|
||||
font-size: 14px;
|
||||
padding: 3px 8px;
|
||||
line-height: 1.2;
|
||||
word-wrap: break-word;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
background-color: #1a1a1a;
|
||||
color: #d3d3d3;
|
||||
}
|
||||
.thumb-card .thumb-body h4 {
|
||||
font-size: 1.1em;
|
||||
margin: 0 0 2px 0;
|
||||
line-height: 1.1;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.thumb-card .thumbnail-wrapper,
|
||||
.thumb-card iframe.card-img-top {
|
||||
display: block;
|
||||
width: 100%;
|
||||
}
|
||||
.thumb-card:has([data-compact]) .thumbnail-wrapper,
|
||||
.thumb-card:has([data-compact]) .thumbnail-wrapper.compact {
|
||||
height: 24px;
|
||||
flex: 0 0 auto;
|
||||
}
|
||||
.thumb-card:has([data-compact]) .thumb-body {
|
||||
padding: 2px 6px;
|
||||
font-size: 12px;
|
||||
max-height: 20px;
|
||||
}
|
||||
.thumb-card:has([data-compact]) .thumb-body h4 {
|
||||
font-size: 0.9em;
|
||||
margin-bottom: 0px;
|
||||
line-height: 1;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 4px;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.thumb-card.selected-card {
|
||||
border: 2px solid orange;
|
||||
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
|
||||
}
|
||||
.thumb-compact .thumbnail-wrapper {
|
||||
height: 32px;
|
||||
}
|
||||
.thumb-compact {
|
||||
margin-bottom: 0px;
|
||||
border-radius: 6px;
|
||||
}
|
||||
.thumb-compact .card-body {
|
||||
display: block;
|
||||
padding: 4px 8px;
|
||||
font-size: 12px;
|
||||
line-height: 1.2;
|
||||
max-height: none;
|
||||
}
|
||||
.thumb-compact .thumbnail-compact,
|
||||
.thumb-compact .thumbnail-text {
|
||||
height: 32px;
|
||||
max-height: 32px;
|
||||
}
|
||||
.full-page-iframe {
|
||||
border-top: 1px solid #ddd;
|
||||
width: 100%;
|
||||
@@ -203,6 +403,10 @@
|
||||
box-shadow: 4px 4px 4px rgba(0,0,0,0.2);
|
||||
margin-top: 0px;
|
||||
}
|
||||
.header-bottom.container-fluid {
|
||||
padding-left: 6px;
|
||||
padding-right: 6px;
|
||||
}
|
||||
.header-bottom-info {
|
||||
color: #6f6f6f;
|
||||
padding-top: 0px;
|
||||
@@ -357,15 +561,15 @@
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="row header-bottom-frames">
|
||||
<div class="thumb-grid">
|
||||
{% for result_info in archiveresults %}
|
||||
{% if result_info.result %}
|
||||
{% plugin_thumbnail result_info.result as thumbnail_html %}
|
||||
{% if thumbnail_html %}
|
||||
<div class="col-lg-2">
|
||||
<div class="card{% if forloop.first %} selected-card{% endif %}">
|
||||
{{ thumbnail_html }}
|
||||
<div class="card-body">
|
||||
{% plugin_card result_info.result as thumbnail_html %}
|
||||
<div class="thumb-card{% if forloop.first %} selected-card{% endif %}">
|
||||
<div class="thumbnail-wrapper">
|
||||
{{ thumbnail_html }}
|
||||
</div>
|
||||
<div class="thumb-body">
|
||||
<a href="{{ result_info.path }}" title="Open in new tab..." target="_blank" rel="noopener">
|
||||
<p class="card-text"><code>{{ result_info.path }}</code></p>
|
||||
</a>
|
||||
@@ -373,18 +577,15 @@
|
||||
<h4 class="card-title">{{ result_info.name|title }}</h4>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
||||
{% get_config "PREVIEW_ORIGINALS" as preview_originals %}
|
||||
{% if preview_originals %}
|
||||
<div class="col-lg-2">
|
||||
<div class="card">
|
||||
<div class="thumb-card">
|
||||
<iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy" referrerpolicy="no-referrer"></iframe>
|
||||
<div class="card-body">
|
||||
<div class="thumb-body">
|
||||
<a href="{{url}}" title="Open in new tab..." target="_blank" rel="noopener" referrerpolicy="no-referrer">
|
||||
<p class="card-text"><code>🌐 {{domain}}</code></p>
|
||||
</a>
|
||||
@@ -392,7 +593,6 @@
|
||||
<h4 class="card-title">Original</h4>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
@@ -417,19 +617,31 @@
|
||||
}
|
||||
|
||||
// show selected file in iframe when preview card is clicked
|
||||
jQuery('.card').on('click', function(e) {
|
||||
jQuery('.thumb-card').on('click', function(e) {
|
||||
jQuery('.selected-card').removeClass('selected-card')
|
||||
jQuery(e.target).closest('.card').addClass('selected-card')
|
||||
})
|
||||
jQuery('.card a[target=preview]').on('click', function(e) {
|
||||
if (e.currentTarget.href.endsWith('.pdf')) {
|
||||
jQuery('.full-page-iframe')[0].removeAttribute('sandbox')
|
||||
} else {
|
||||
jQuery('.full-page-iframe')[0].sandbox = "allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation"
|
||||
jQuery(e.target).closest('.thumb-card').addClass('selected-card')
|
||||
|
||||
const link = e.target.closest('a[target=preview]') || e.currentTarget.querySelector('a[target=preview]') || e.currentTarget.querySelector('a')
|
||||
if (!link || !link.href || link.href.endsWith('#')) {
|
||||
return true
|
||||
}
|
||||
window.location.hash = getPreviewTypeFromPath(e.currentTarget)
|
||||
const iframe = jQuery('.full-page-iframe')[0]
|
||||
if (!iframe) {
|
||||
return true
|
||||
}
|
||||
if (link.href.endsWith('.pdf')) {
|
||||
iframe.removeAttribute('sandbox')
|
||||
} else {
|
||||
iframe.sandbox = "allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation"
|
||||
}
|
||||
window.location.hash = getPreviewTypeFromPath(link)
|
||||
iframe.src = link.href
|
||||
return true
|
||||
})
|
||||
jQuery('.thumb-card a[target=preview]').on('click', function(e) {
|
||||
e.preventDefault()
|
||||
return false
|
||||
})
|
||||
|
||||
function hideSnapshotHeader() {
|
||||
console.log('Collapsing Snapshot header...')
|
||||
@@ -483,7 +695,7 @@
|
||||
for (const link of jQuery('a[target=preview]')) {
|
||||
console.log(link.pathname)
|
||||
if (getPreviewTypeFromPath(link) == window.location.hash.slice(1).toLowerCase()) {
|
||||
jQuery(link).closest('.card').click()
|
||||
jQuery(link).closest('.thumb-card').click()
|
||||
jQuery(link).click()
|
||||
link.click()
|
||||
}
|
||||
@@ -502,7 +714,7 @@
|
||||
|
||||
// hide all preview iframes on small screens
|
||||
if (window.innerWidth < 1091) {
|
||||
jQuery('.card a[target=preview]').attr('target', '_self')
|
||||
jQuery('.thumb-card a[target=preview]').attr('target', '_self')
|
||||
}
|
||||
|
||||
var pdf_frame = document.querySelector('.pdf-frame');
|
||||
|
||||
@@ -130,12 +130,15 @@
|
||||
|
||||
.header-bottom-frames .card {
|
||||
box-shadow: 2px 2px 7px 0px rgba(0, 0, 0, 0.1);
|
||||
margin-bottom: 5px;
|
||||
margin-bottom: 6px;
|
||||
border: 1px solid rgba(0, 0, 0, 0.06);
|
||||
border-radius: 10px;
|
||||
background-color: #efefef;
|
||||
overflow: hidden;
|
||||
height: 130px;
|
||||
min-height: 130px;
|
||||
}
|
||||
.header-bottom-frames .card:has([data-compact]) {
|
||||
min-height: 0;
|
||||
}
|
||||
.card h4 {
|
||||
font-size: 0.8em;
|
||||
@@ -144,7 +147,7 @@
|
||||
text-transform: uppercase;
|
||||
margin-top: 0px;
|
||||
margin-bottom: 5px;
|
||||
color: rgb(93, 105, 110);
|
||||
color: #222;
|
||||
}
|
||||
.card-body {
|
||||
font-size: 14px;
|
||||
@@ -158,7 +161,8 @@
|
||||
max-height: 102px;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
color: #d3d3d3;
|
||||
color: #222;
|
||||
background-color: #f6f6f6;
|
||||
}
|
||||
.card-title {
|
||||
margin-bottom: 4px;
|
||||
@@ -213,6 +217,10 @@
|
||||
background-color: #333;
|
||||
pointer-events: none;
|
||||
}
|
||||
.thumbnail-wrapper.compact {
|
||||
height: 32px;
|
||||
background-color: #111;
|
||||
}
|
||||
.thumbnail-wrapper iframe {
|
||||
width: 405%;
|
||||
height: 430px;
|
||||
@@ -228,10 +236,89 @@
|
||||
object-fit: cover;
|
||||
object-position: top center;
|
||||
}
|
||||
.thumbnail-compact {
|
||||
height: 32px;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
padding: 0 8px;
|
||||
font-size: 13px;
|
||||
line-height: 1;
|
||||
color: #bdbdbd;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.02em;
|
||||
}
|
||||
.thumbnail-compact-label {
|
||||
color: #e1e1e1;
|
||||
}
|
||||
.thumbnail-compact-meta {
|
||||
color: #777;
|
||||
font-size: 11px;
|
||||
margin-left: auto;
|
||||
}
|
||||
.thumbnail-compact svg,
|
||||
.thumbnail-compact img {
|
||||
height: 12px;
|
||||
width: 12px;
|
||||
}
|
||||
.thumbnail-text {
|
||||
height: 100px;
|
||||
background: #121212;
|
||||
color: #d8d8d8;
|
||||
padding: 6px 8px;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 4px;
|
||||
font-family: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
|
||||
font-size: 11px;
|
||||
line-height: 1.2;
|
||||
overflow: hidden;
|
||||
}
|
||||
.thumbnail-text-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
font-size: 10px;
|
||||
color: #9b9b9b;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.04em;
|
||||
}
|
||||
.thumbnail-text-pre {
|
||||
margin: 0;
|
||||
white-space: pre-wrap;
|
||||
overflow: hidden;
|
||||
display: -webkit-box;
|
||||
-webkit-box-orient: vertical;
|
||||
-webkit-line-clamp: 5;
|
||||
}
|
||||
.thumbnail-text[data-plugin="title"] .thumbnail-text-pre {
|
||||
font-size: 13px;
|
||||
font-weight: 600;
|
||||
-webkit-line-clamp: 3;
|
||||
}
|
||||
.card.selected-card {
|
||||
border: 2px solid orange;
|
||||
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
|
||||
}
|
||||
.thumb-compact .card-body {
|
||||
display: block;
|
||||
padding: 4px 8px;
|
||||
font-size: 12px;
|
||||
line-height: 1.2;
|
||||
max-height: none;
|
||||
}
|
||||
.thumb-compact {
|
||||
margin-bottom: 0px;
|
||||
border-radius: 6px;
|
||||
}
|
||||
.thumb-compact .thumbnail-wrapper {
|
||||
height: 32px;
|
||||
}
|
||||
.thumb-compact .thumbnail-compact,
|
||||
.thumb-compact .thumbnail-text {
|
||||
height: 32px;
|
||||
max-height: 32px;
|
||||
}
|
||||
.iframe-large {
|
||||
height: calc(100vh - 70px);
|
||||
}
|
||||
@@ -256,6 +343,83 @@
|
||||
object-fit: cover;
|
||||
object-position: top center;
|
||||
}
|
||||
.thumb-grid {
|
||||
display: block;
|
||||
column-width: 180px;
|
||||
column-gap: 6px;
|
||||
align-content: start;
|
||||
width: 100%;
|
||||
}
|
||||
.thumb-card {
|
||||
box-shadow: 2px 2px 7px 0px rgba(0, 0, 0, 0.1);
|
||||
border: 1px solid rgba(0, 0, 0, 0.06);
|
||||
border-radius: 10px;
|
||||
background-color: #efefef;
|
||||
overflow: hidden;
|
||||
display: inline-block;
|
||||
width: 100%;
|
||||
break-inside: avoid;
|
||||
box-sizing: border-box;
|
||||
margin-bottom: 6px;
|
||||
height: 138px;
|
||||
min-height: 138px;
|
||||
max-height: 138px;
|
||||
}
|
||||
.thumb-card:has([data-compact]) {
|
||||
height: 46px;
|
||||
min-height: 46px;
|
||||
max-height: 46px;
|
||||
}
|
||||
.thumb-card .thumb-body {
|
||||
font-size: 14px;
|
||||
padding: 3px 8px;
|
||||
line-height: 1.2;
|
||||
word-wrap: break-word;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
color: #222;
|
||||
background-color: #f6f6f6;
|
||||
}
|
||||
.thumb-card .thumb-body h4 {
|
||||
font-size: 0.8em;
|
||||
text-transform: uppercase;
|
||||
margin: 0 0 2px 0;
|
||||
color: #222;
|
||||
line-height: 1.1;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.thumb-card .thumbnail-wrapper,
|
||||
.thumb-card iframe.card-img-top {
|
||||
display: block;
|
||||
width: 100%;
|
||||
}
|
||||
.thumb-card:has([data-compact]) .thumbnail-wrapper,
|
||||
.thumb-card:has([data-compact]) .thumbnail-wrapper.compact {
|
||||
height: 24px;
|
||||
flex: 0 0 auto;
|
||||
}
|
||||
.thumb-card:has([data-compact]) .thumb-body {
|
||||
padding: 2px 6px;
|
||||
font-size: 12px;
|
||||
max-height: 20px;
|
||||
}
|
||||
.thumb-card:has([data-compact]) .thumb-body h4 {
|
||||
font-size: 0.9em;
|
||||
margin-bottom: 0px;
|
||||
line-height: 1;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 4px;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.thumb-card.selected-card {
|
||||
border: 2px solid orange;
|
||||
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
|
||||
}
|
||||
.header-bottom {
|
||||
border-top: 1px solid rgba(170, 30, 85, 0.9);
|
||||
padding-bottom: 1px;
|
||||
@@ -268,6 +432,10 @@
|
||||
box-shadow: 4px 4px 4px rgba(0,0,0,0.2);
|
||||
margin-top: 0px;
|
||||
}
|
||||
.header-bottom.container-fluid {
|
||||
padding-left: 6px;
|
||||
padding-right: 6px;
|
||||
}
|
||||
.header-bottom-info {
|
||||
color: #6f6f6f;
|
||||
padding-top: 0px;
|
||||
@@ -315,9 +483,41 @@
|
||||
width: 100%;
|
||||
overflow: hidden;
|
||||
}
|
||||
.header-bottom-frames {
|
||||
.row.header-bottom-frames {
|
||||
padding-top: 5px;
|
||||
justify-content: center;
|
||||
display: block !important;
|
||||
width: 100%;
|
||||
max-width: 100%;
|
||||
column-width: 180px;
|
||||
column-gap: 8px;
|
||||
column-fill: auto;
|
||||
margin-left: 0px;
|
||||
margin-right: 0px;
|
||||
flex: none !important;
|
||||
}
|
||||
.header-bottom-frames .col-lg-2 {
|
||||
padding-left: 0px;
|
||||
padding-right: 0px;
|
||||
max-width: 100%;
|
||||
width: 100% !important;
|
||||
display: inline-block !important;
|
||||
float: none !important;
|
||||
flex: none !important;
|
||||
break-inside: avoid;
|
||||
margin-bottom: 6px;
|
||||
vertical-align: top;
|
||||
}
|
||||
.header-bottom-frames .card:has([data-compact]) .thumbnail-wrapper,
|
||||
.header-bottom-frames .card:has([data-compact]) .thumbnail-wrapper.compact {
|
||||
height: 32px;
|
||||
}
|
||||
.header-bottom-frames .card:has([data-compact]) .thumbnail-text {
|
||||
height: auto;
|
||||
max-height: 64px;
|
||||
}
|
||||
.header-bottom-frames .card:has([data-compact]) .card-body {
|
||||
padding: 4px 8px;
|
||||
max-height: 44px;
|
||||
}
|
||||
.header-bottom-frames .card-title {
|
||||
width: 100%;
|
||||
@@ -325,7 +525,7 @@
|
||||
font-size: 17px;
|
||||
margin-bottom: 0px;
|
||||
display: inline-block;
|
||||
color: #d3d3d3;
|
||||
color: #222;
|
||||
font-weight: 200;
|
||||
vertical-align: 3px;
|
||||
}
|
||||
@@ -415,7 +615,7 @@
|
||||
</small>
|
||||
</div>
|
||||
<div class="col-lg-2" style="padding-top: 4px">
|
||||
<a href="/archive/{{url}}" title="Date Added: {{bookmarked_date}} | First Archived: {{oldest_archive_date|default:downloaded_datestr}} | Last Checked: {{downloaded_datestr}} (UTC)">
|
||||
<a href="/{{archive_path}}/index.html" title="Date Added: {{bookmarked_date}} | First Archived: {{oldest_archive_date|default:downloaded_datestr}} | Last Checked: {{downloaded_datestr}} (UTC)">
|
||||
{{oldest_archive_date|default:downloaded_datestr|default:bookmarked_date}}
|
||||
</a>
|
||||
<br/>
|
||||
@@ -431,34 +631,45 @@
|
||||
</div>
|
||||
</div>
|
||||
<div class="header-bottom container-fluid">
|
||||
<div class="row header-bottom-frames">
|
||||
<div class="thumb-grid">
|
||||
|
||||
|
||||
{% for result in archiveresults %}
|
||||
<div class="col-lg-2">
|
||||
<div class="card {% if forloop.first %}selected-card{% endif %}">
|
||||
<div class="card-body">
|
||||
<a href="{{result.path|urlencode}}" target="preview" title="./{{result.path}} (downloaded {{result.ts}})">
|
||||
<div class="thumb-card{% if forloop.first %} selected-card{% endif %}">
|
||||
{% with display_path=result.path %}
|
||||
<div class="thumb-body">
|
||||
{% if display_path %}
|
||||
<a href="{{display_path|urlencode}}" target="preview" title="./{{display_path}} (downloaded {{result.ts}})">
|
||||
<h4>{% plugin_icon result.name %} {{result.name|plugin_name|truncatechars:20}} <small>({{result.size|filesizeformat}})</small></h4>
|
||||
</a>
|
||||
{% else %}
|
||||
<h4>{% plugin_icon result.name %} {{result.name|plugin_name|truncatechars:20}} <small>({{result.size|filesizeformat}})</small></h4>
|
||||
</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% if result.result %}
|
||||
{# Use plugin-specific thumbnail template when ArchiveResult is available #}
|
||||
{% if result.result and display_path %}
|
||||
{# Use plugin-specific card template when ArchiveResult is available #}
|
||||
<div class="card-img-top thumbnail-wrapper">
|
||||
{% plugin_thumbnail result.result %}
|
||||
{% plugin_card result.result %}
|
||||
</div>
|
||||
{% else %}
|
||||
{% elif result.is_metadata and display_path %}
|
||||
<div class="card-img-top thumbnail-wrapper compact">
|
||||
<div class="thumbnail-compact" data-plugin="{{result.name}}">
|
||||
<span class="thumbnail-compact-icon">{% plugin_icon result.name %}</span>
|
||||
<span class="thumbnail-compact-label">{{result.name|plugin_name}}</span>
|
||||
<span class="thumbnail-compact-meta">metadata</span>
|
||||
</div>
|
||||
</div>
|
||||
{% elif display_path %}
|
||||
{# Fall back to generic iframe for filesystem-discovered files #}
|
||||
<iframe class="card-img-top" src="{{result.path|urlencode}}?autoplay=0" allow="autoplay 'none'; fullscreen 'none'; navigation-override 'none'; " sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
|
||||
<iframe class="card-img-top" src="{{display_path|urlencode}}?autoplay=0" allow="autoplay 'none'; fullscreen 'none'; navigation-override 'none'; " sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endwith %}
|
||||
</div>
|
||||
{% endfor %}
|
||||
|
||||
|
||||
<div class="col-lg-2">
|
||||
<div class="card">
|
||||
<div class="card-body">
|
||||
<div class="thumb-card">
|
||||
<div class="thumb-body">
|
||||
<a href="./" target="preview">
|
||||
<h4>Headers, JSON, etc.</h4>
|
||||
</a>
|
||||
@@ -466,7 +677,6 @@
|
||||
</div>
|
||||
<iframe class="card-img-top" src="./" sandbox="" scrolling="no" loading="lazy"></iframe>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</header>
|
||||
@@ -476,11 +686,11 @@
|
||||
{% if best_result.result %}
|
||||
{# Use plugin-specific fullscreen template when ArchiveResult is available #}
|
||||
<div id="main-frame-wrapper" class="full-page-wrapper">
|
||||
{% plugin_fullscreen best_result.result %}
|
||||
{% plugin_full best_result.result %}
|
||||
</div>
|
||||
{% else %}
|
||||
{# Fall back to generic iframe #}
|
||||
<iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_result.path|urlencode}}" name="preview"></iframe>
|
||||
<iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_result.path|default:'about:blank'|urlencode}}" name="preview"></iframe>
|
||||
{% endif %}
|
||||
|
||||
|
||||
@@ -513,21 +723,45 @@
|
||||
return link.getAttribute('href')
|
||||
}
|
||||
|
||||
const iframe_elem = document.getElementById('main-frame')
|
||||
function ensureMainFrame() {
|
||||
let frame = document.getElementById('main-frame')
|
||||
if (!frame) {
|
||||
const wrapper = document.getElementById('main-frame-wrapper')
|
||||
frame = document.createElement('iframe')
|
||||
frame.id = 'main-frame'
|
||||
frame.name = 'preview'
|
||||
frame.className = 'full-page-iframe'
|
||||
frame.sandbox = "allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms"
|
||||
if (wrapper) {
|
||||
wrapper.innerHTML = ''
|
||||
wrapper.appendChild(frame)
|
||||
wrapper.classList.remove('full-page-wrapper')
|
||||
}
|
||||
}
|
||||
return frame
|
||||
}
|
||||
|
||||
for (const card of [...document.querySelectorAll('.card')]) {
|
||||
for (const card of [...document.querySelectorAll('.thumb-card')]) {
|
||||
card.addEventListener('click', function(event) {
|
||||
const target = event.currentTarget.querySelector('a').href
|
||||
const link = event.target.closest('a[target=preview]') || event.currentTarget.querySelector('a[target=preview]') || event.currentTarget.querySelector('a')
|
||||
if (!link) {
|
||||
return
|
||||
}
|
||||
const target = link.href
|
||||
if (!target || target.endsWith('#')) {
|
||||
return
|
||||
}
|
||||
|
||||
jQuery('.selected-card').removeClass('selected-card')
|
||||
jQuery(event.currentTarget).closest('.card').addClass('selected-card')
|
||||
jQuery(event.currentTarget).closest('.thumb-card').addClass('selected-card')
|
||||
|
||||
const iframe_elem = ensureMainFrame()
|
||||
if (target.endsWith('.pdf')) {
|
||||
jQuery('#main-frame')[0].removeAttribute('sandbox')
|
||||
iframe_elem.removeAttribute('sandbox')
|
||||
} else {
|
||||
jQuery('#main-frame')[0].sandbox = "allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms"
|
||||
iframe_elem.sandbox = "allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms"
|
||||
}
|
||||
window.location.hash = getPreviewTypeFromPath(event.currentTarget.querySelector('a'))
|
||||
window.location.hash = getPreviewTypeFromPath(link)
|
||||
|
||||
iframe_elem.src = target
|
||||
})
|
||||
@@ -587,7 +821,7 @@
|
||||
for (const link of jQuery('a[target=preview]')) {
|
||||
console.log(link.pathname)
|
||||
if (getPreviewTypeFromPath(link) == window.location.hash.slice(1).toLowerCase()) {
|
||||
jQuery(link).closest('.card').click()
|
||||
jQuery(link).closest('.thumb-card').click()
|
||||
jQuery(link).click()
|
||||
link.click()
|
||||
}
|
||||
|
||||
@@ -698,7 +698,7 @@ class SnapshotWorker(Worker):
|
||||
|
||||
try:
|
||||
# Get merged config (includes env vars passed via Process.env, snapshot.config, defaults, etc.)
|
||||
config = get_config(snapshot=self.snapshot)
|
||||
config = get_config(snapshot=self.snapshot, crawl=self.snapshot.crawl)
|
||||
|
||||
# Discover all hooks for this snapshot
|
||||
hooks = discover_hooks('Snapshot', config=config)
|
||||
@@ -842,14 +842,13 @@ class SnapshotWorker(Worker):
|
||||
# Clear to avoid double-termination during on_shutdown
|
||||
self.background_processes = {}
|
||||
|
||||
# Update STARTED background results now that hooks are done
|
||||
# Update background results now that hooks are done
|
||||
from archivebox.core.models import ArchiveResult
|
||||
|
||||
started_bg = self.snapshot.archiveresult_set.filter(
|
||||
status=ArchiveResult.StatusChoices.STARTED,
|
||||
bg_results = self.snapshot.archiveresult_set.filter(
|
||||
hook_name__contains='.bg.',
|
||||
)
|
||||
for ar in started_bg:
|
||||
for ar in bg_results:
|
||||
ar.update_from_output()
|
||||
|
||||
def _reap_background_hooks(self) -> None:
|
||||
@@ -867,7 +866,7 @@ class SnapshotWorker(Worker):
|
||||
continue
|
||||
|
||||
ar = self.snapshot.archiveresult_set.filter(hook_name=hook_name).first()
|
||||
if ar and ar.status == ArchiveResult.StatusChoices.STARTED:
|
||||
if ar:
|
||||
ar.update_from_output()
|
||||
|
||||
# Remove completed hook from tracking
|
||||
|
||||
Reference in New Issue
Block a user