cleanup tui, startup, card templtes, and more

This commit is contained in:
Nick Sweeting
2026-01-19 14:33:20 -08:00
parent bef67760db
commit 86e7973334
68 changed files with 1370 additions and 546 deletions

5
.claude/settings.json Normal file
View File

@@ -0,0 +1,5 @@
{
"enabledPlugins": {
"pyright-lsp@claude-plugins-official": true
}
}

View File

@@ -30,10 +30,13 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
from archivebox.misc.checks import check_data_folder
check_data_folder()
from django.core.management import call_command
from django.contrib.auth.models import User
from archivebox.config.common import SHELL_CONFIG
run_in_debug = SHELL_CONFIG.DEBUG or debug or reload
if debug or reload:
SHELL_CONFIG.DEBUG = True
from django.contrib.auth.models import User
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
print()
@@ -56,7 +59,8 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
except IndexError:
pass
if SHELL_CONFIG.DEBUG:
if run_in_debug:
from django.core.management import call_command
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')

View File

@@ -57,7 +57,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
# Build output link - use embed_path() which checks output_files first
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
output_link = f'/archive/{result.snapshot.timestamp}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
output_link = f'/{result.snapshot.archive_path}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/{result.snapshot.archive_path}/'
# Get version - try cmd_version field
version = result.cmd_version if result.cmd_version else '-'
@@ -83,8 +83,8 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
{icon}
</td>
<td style="padding: 10px 12px; font-weight: 500; color: #334155;">
<a href="{output_link}" target="_blank"
style="color: #334155; text-decoration: none;"
<a href="{output_link}" target="_blank"
style="color: #334155; text-decoration: none;"
title="View output fullscreen"
onmouseover="this.style.color='#2563eb'; this.style.textDecoration='underline';"
onmouseout="this.style.color='#334155'; this.style.textDecoration='none';">
@@ -301,8 +301,8 @@ class ArchiveResultAdmin(BaseModelAdmin):
)
def snapshot_info(self, result):
return format_html(
'<a href="/archive/{}/index.html"><b><code>[{}]</code></b> &nbsp; {} &nbsp; {}</a><br/>',
result.snapshot.timestamp,
'<a href="/{}/index.html"><b><code>[{}]</code></b> &nbsp; {} &nbsp; {}</a><br/>',
result.snapshot.archive_path,
str(result.snapshot.id)[:8],
result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'),
result.snapshot.url[:128],
@@ -336,8 +336,8 @@ class ArchiveResultAdmin(BaseModelAdmin):
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
return format_html(
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
result.snapshot.timestamp,
'<a href="/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
result.snapshot.archive_path,
output_path,
result.output_str,
)
@@ -348,7 +348,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
'<pre style="display: inline-block">{}</pre><br/>',
result.output_str,
)
output_html += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp))
output_html += format_html('<a href="/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.archive_path))
embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
path_from_embed = (snapshot_dir / (embed_path or ''))
output_html += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(embed_path))

View File

@@ -237,13 +237,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
'''
<div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: center;">
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/archive/{}"
href="/{}"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📄 Summary Page
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/archive/{}/index.html#all"
href="/{}/index.html#all"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📁 Result Files
@@ -291,8 +291,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
<b>Tip:</b> Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
</p>
''',
obj.timestamp,
obj.timestamp,
obj.archive_path,
obj.archive_path,
obj.url,
obj.pk,
obj.pk,
@@ -310,7 +310,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
'' if obj.is_archived else '',
obj.num_outputs,
self.size(obj) or '0kb',
f'/archive/{obj.timestamp}/favicon.ico',
f'/{obj.archive_path}/favicon.ico',
obj.extension or '-',
)

View File

@@ -1,6 +1,7 @@
__package__ = 'archivebox.core'
import ipaddress
import re
from django.utils import timezone
from django.contrib.auth.middleware import RemoteUserMiddleware
from django.core.exceptions import ImproperlyConfigured
@@ -28,10 +29,11 @@ def TimezoneMiddleware(get_response):
def CacheControlMiddleware(get_response):
snapshot_path_re = re.compile(r"^/[^/]+/\\d{8}/[^/]+/[0-9a-fA-F-]{8,36}/")
def middleware(request):
response = get_response(request)
if '/archive/' in request.path or '/static/' in request.path:
if '/archive/' in request.path or '/static/' in request.path or snapshot_path_re.match(request.path):
policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
# print('Set Cache-Control header to', response['Cache-Control'])

View File

@@ -1296,7 +1296,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
)}
path = self.archive_path
canon = self.canonical_outputs()
output = ""
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
@@ -1313,10 +1312,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if not icon.strip() and not existing:
continue
embed_path = result.embed_path() if result else f'{plugin}/'
output += format_html(
output_template,
path,
canon.get(plugin, plugin + '/'),
embed_path,
str(bool(existing)),
plugin,
icon
@@ -1402,9 +1402,38 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return
@cached_property
def archive_path(self):
def legacy_archive_path(self) -> str:
return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}'
@cached_property
def url_path(self) -> str:
"""URL path matching the current snapshot output_dir layout."""
try:
rel_path = Path(self.output_dir).resolve().relative_to(CONSTANTS.DATA_DIR)
except Exception:
return self.legacy_archive_path
parts = rel_path.parts
# New layout: users/<username>/snapshots/<YYYYMMDD>/<domain>/<uuid>/
if len(parts) >= 6 and parts[0] == 'users' and parts[2] == 'snapshots':
username = parts[1]
if username == 'system':
username = 'web'
date_str = parts[3]
domain = parts[4]
snapshot_id = parts[5]
return f'{username}/{date_str}/{domain}/{snapshot_id}'
# Legacy layout: archive/<timestamp>/
if len(parts) >= 2 and parts[0] == CONSTANTS.ARCHIVE_DIR_NAME:
return f'{parts[0]}/{parts[1]}'
return '/'.join(parts)
@cached_property
def archive_path(self):
return self.url_path
@cached_property
def archive_size(self):
try:
@@ -1467,8 +1496,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
for pid_file in Path(self.output_dir).glob('**/*.pid'):
pid_file.unlink(missing_ok=True)
# Update all STARTED ArchiveResults from filesystem
results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
# Update all background ArchiveResults from filesystem (in case output arrived late)
results = self.archiveresult_set.filter(hook_name__contains='.bg.')
for ar in results:
ar.update_from_output()
@@ -1914,153 +1943,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Output Path Methods (migrated from Link schema)
# =========================================================================
def canonical_outputs(self) -> Dict[str, Optional[str]]:
"""
Intelligently discover the best output file for each plugin.
Uses actual ArchiveResult data and filesystem scanning with smart heuristics.
"""
FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'
# Mimetypes that can be embedded/previewed in an iframe
IFRAME_EMBEDDABLE_EXTENSIONS = {
'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl',
'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico',
'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav',
}
MIN_DISPLAY_SIZE = 15_000 # 15KB - filter out tiny files
MAX_SCAN_FILES = 50 # Don't scan massive directories
def find_best_output_in_dir(dir_path: Path, plugin_name: str) -> Optional[str]:
"""Find the best representative file in a plugin's output directory"""
if not dir_path.exists() or not dir_path.is_dir():
return None
candidates = []
file_count = 0
# Special handling for media plugin - look for thumbnails
is_media_dir = plugin_name == 'media'
# Scan for suitable files
for file_path in dir_path.rglob('*'):
file_count += 1
if file_count > MAX_SCAN_FILES:
break
if file_path.is_dir() or file_path.name.startswith('.'):
continue
ext = file_path.suffix.lstrip('.').lower()
if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
continue
try:
size = file_path.stat().st_size
except OSError:
continue
# For media dir, allow smaller image files (thumbnails are often < 15KB)
min_size = 5_000 if (is_media_dir and ext in ('png', 'jpg', 'jpeg', 'webp', 'gif')) else MIN_DISPLAY_SIZE
if size < min_size:
continue
# Prefer main files: index.html, output.*, content.*, etc.
priority = 0
name_lower = file_path.name.lower()
if is_media_dir:
# Special prioritization for media directories
if any(keyword in name_lower for keyword in ('thumb', 'thumbnail', 'cover', 'poster')):
priority = 200 # Highest priority for thumbnails
elif ext in ('png', 'jpg', 'jpeg', 'webp', 'gif'):
priority = 150 # High priority for any image
elif ext in ('mp4', 'webm', 'mp3', 'opus', 'ogg'):
priority = 100 # Lower priority for actual media files
else:
priority = 50
elif 'index' in name_lower:
priority = 100
elif name_lower.startswith(('output', 'content', plugin_name)):
priority = 50
elif ext in ('html', 'htm', 'pdf'):
priority = 30
elif ext in ('png', 'jpg', 'jpeg', 'webp'):
priority = 20
else:
priority = 10
candidates.append((priority, size, file_path))
if not candidates:
return None
# Sort by priority (desc), then size (desc)
candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
best_file = candidates[0][2]
return str(best_file.relative_to(Path(self.output_dir)))
canonical = {
'index_path': 'index.html',
'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
'archivedotorg_path': f'https://web.archive.org/web/{self.base_url}',
}
# Scan each ArchiveResult's output directory for the best file
snap_dir = Path(self.output_dir)
for result in self.archiveresult_set.filter(status='succeeded'):
if not result.output_files and not result.output_str:
continue
# Try to find the best output file for this plugin
plugin_dir = snap_dir / result.plugin
best_output = None
# Check output_files first (new field)
if result.output_files:
first_file = next(iter(result.output_files.keys()), None)
if first_file and (plugin_dir / first_file).exists():
best_output = f'{result.plugin}/{first_file}'
# Fallback to output_str if it looks like a path
if not best_output and result.output_str and (snap_dir / result.output_str).exists():
best_output = result.output_str
if not best_output and plugin_dir.exists():
# Intelligently find the best file in the plugin's directory
best_output = find_best_output_in_dir(plugin_dir, result.plugin)
if best_output:
canonical[f'{result.plugin}_path'] = best_output
# Also scan top-level for legacy outputs (backwards compatibility)
for file_path in snap_dir.glob('*'):
if file_path.is_dir() or file_path.name in ('index.html', 'index.json'):
continue
ext = file_path.suffix.lstrip('.').lower()
if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
continue
try:
size = file_path.stat().st_size
if size >= MIN_DISPLAY_SIZE:
# Add as generic output with stem as key
key = f'{file_path.stem}_path'
if key not in canonical:
canonical[key] = file_path.name
except OSError:
continue
if self.is_static:
static_path = f'warc/{self.timestamp}'
canonical.update({
'title': self.basename,
'wget_path': static_path,
})
return canonical
def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
"""Get the latest output that each plugin produced"""
from archivebox.hooks import get_plugins
@@ -2078,6 +1960,96 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
latest[plugin] = result.embed_path() if result else None
return latest
def discover_outputs(self) -> list[dict]:
"""Discover output files from ArchiveResults and filesystem."""
from archivebox.misc.util import ts_to_date_str
ArchiveResult = self.archiveresult_set.model
snap_dir = Path(self.output_dir)
outputs: list[dict] = []
seen: set[str] = set()
text_exts = ('.json', '.jsonl', '.txt', '.csv', '.tsv', '.xml', '.yml', '.yaml', '.md', '.log')
def is_metadata_path(path: str | None) -> bool:
lower = (path or '').lower()
return lower.endswith(text_exts)
def is_compact_path(path: str | None) -> bool:
lower = (path or '').lower()
return lower.endswith(text_exts)
for result in self.archiveresult_set.all().order_by('start_ts'):
embed_path = result.embed_path()
if not embed_path or embed_path.strip() in ('.', '/', './'):
continue
abs_path = snap_dir / embed_path
if not abs_path.exists():
continue
if abs_path.is_dir():
if not any(p.is_file() for p in abs_path.rglob('*')):
continue
size = sum(p.stat().st_size for p in abs_path.rglob('*') if p.is_file())
else:
size = abs_path.stat().st_size
outputs.append({
'name': result.plugin,
'path': embed_path,
'ts': ts_to_date_str(result.end_ts),
'size': size or 0,
'is_metadata': is_metadata_path(embed_path),
'is_compact': is_compact_path(embed_path),
'result': result,
})
seen.add(result.plugin)
embeddable_exts = {
'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl', 'csv', 'tsv',
'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico',
'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav',
}
for entry in snap_dir.iterdir():
if entry.name in ('index.html', 'index.json', 'favicon.ico', 'warc'):
continue
if entry.is_dir():
plugin = entry.name
if plugin in seen:
continue
best_file = ArchiveResult._find_best_output_file(entry, plugin)
if not best_file:
continue
rel_path = str(best_file.relative_to(snap_dir))
outputs.append({
'name': plugin,
'path': rel_path,
'ts': ts_to_date_str(best_file.stat().st_mtime or 0),
'size': best_file.stat().st_size or 0,
'is_metadata': is_metadata_path(rel_path),
'is_compact': is_compact_path(rel_path),
'result': None,
})
seen.add(plugin)
elif entry.is_file():
ext = entry.suffix.lstrip('.').lower()
if ext not in embeddable_exts:
continue
plugin = entry.stem
if plugin in seen:
continue
outputs.append({
'name': plugin,
'path': entry.name,
'ts': ts_to_date_str(entry.stat().st_mtime or 0),
'size': entry.stat().st_size or 0,
'is_metadata': is_metadata_path(entry.name),
'is_compact': is_compact_path(entry.name),
'result': None,
})
seen.add(plugin)
return outputs
# =========================================================================
# Serialization Methods
# =========================================================================
@@ -2114,8 +2086,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'num_outputs': self.num_outputs,
'num_failures': self.num_failures,
}
if extended:
result['canonical'] = self.canonical_outputs()
return result
def to_json_str(self, indent: int = 4) -> str:
@@ -2146,23 +2116,29 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
TITLE_LOADING_MSG = 'Not yet archived...'
canonical = self.canonical_outputs()
preview_priority = [
'singlefile_path',
'screenshot_path',
'wget_path',
'dom_path',
'pdf_path',
'readability_path',
'singlefile',
'screenshot',
'wget',
'dom',
'pdf',
'readability',
]
best_preview_path = next(
(canonical.get(key) for key in preview_priority if canonical.get(key)),
canonical.get('index_path', 'index.html'),
)
outputs = self.discover_outputs()
outputs_by_plugin = {out['name']: out for out in outputs}
best_preview_path = 'about:blank'
for plugin in preview_priority:
out = outputs_by_plugin.get(plugin)
if out and out.get('path'):
best_preview_path = out['path']
break
if best_preview_path == 'about:blank' and outputs:
best_preview_path = outputs[0].get('path') or 'about:blank'
context = {
**self.to_dict(extended=True),
**{f'{k}_path': v for k, v in canonical.items()},
'canonical': {f'{k}_path': v for k, v in canonical.items()},
'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
'url_str': htmlencode(urldecode(self.base_url)),
'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
@@ -2175,6 +2151,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'best_preview_path': best_preview_path,
'archiveresults': outputs,
}
rendered_html = render_to_string('snapshot.html', context)
atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
@@ -2496,6 +2473,61 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def output_exists(self) -> bool:
return os.path.exists(Path(self.snapshot_dir) / self.plugin)
@staticmethod
def _find_best_output_file(dir_path: Path, plugin_name: str | None = None) -> Optional[Path]:
if not dir_path.exists() or not dir_path.is_dir():
return None
embeddable_exts = {
'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl', 'csv', 'tsv',
'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico',
'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav',
}
for name in ('index.html', 'index.htm'):
candidate = dir_path / name
if candidate.exists() and candidate.is_file():
return candidate
candidates = []
file_count = 0
max_scan = 200
plugin_lower = (plugin_name or '').lower()
for file_path in dir_path.rglob('*'):
file_count += 1
if file_count > max_scan:
break
if file_path.is_dir() or file_path.name.startswith('.'):
continue
ext = file_path.suffix.lstrip('.').lower()
if ext not in embeddable_exts:
continue
try:
size = file_path.stat().st_size
except OSError:
continue
name_lower = file_path.name.lower()
priority = 0
if name_lower.startswith('index'):
priority = 100
elif plugin_lower and name_lower.startswith(('output', 'content', plugin_lower)):
priority = 60
elif ext in ('html', 'htm', 'pdf'):
priority = 40
elif ext in ('png', 'jpg', 'jpeg', 'webp', 'svg', 'gif', 'ico'):
priority = 30
elif ext in ('json', 'jsonl', 'txt', 'md', 'csv', 'tsv'):
priority = 20
else:
priority = 10
candidates.append((priority, size, file_path))
if not candidates:
return None
candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
return candidates[0][2]
def embed_path(self) -> Optional[str]:
"""
Get the relative path to the embeddable output file for this result.
@@ -2503,25 +2535,45 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
Returns the first file from output_files if set, otherwise tries to
find a reasonable default based on the plugin type.
"""
# Check output_files dict for primary output
snapshot_dir = Path(self.snapshot_dir)
plugin_dir = snapshot_dir / self.plugin
# Fallback: treat output_str as a file path only if it exists on disk
if self.output_str:
try:
output_path = Path(self.output_str)
if output_path.is_absolute():
# If absolute and within snapshot dir, normalize to relative
if snapshot_dir in output_path.parents and output_path.exists():
return str(output_path.relative_to(snapshot_dir))
else:
# If relative, prefer plugin-prefixed path, then direct path
if (plugin_dir / output_path).exists():
return f'{self.plugin}/{output_path}'
if output_path.name in ('index.html', 'index.json') and output_path.parent == Path('.'):
return None
if (snapshot_dir / output_path).exists():
return str(output_path)
except Exception:
pass
# Check output_files dict for primary output (ignore non-output files)
if self.output_files:
# Return first file from output_files (dict preserves insertion order)
first_file = next(iter(self.output_files.keys()), None)
if first_file:
ignored = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid', 'cmd.sh'}
output_candidates = [
f for f in self.output_files.keys()
if Path(f).name not in ignored
]
first_file = output_candidates[0] if output_candidates else None
if first_file and (plugin_dir / first_file).exists():
return f'{self.plugin}/{first_file}'
# Fallback: check output_str if it looks like a file path
if self.output_str and ('/' in self.output_str or '.' in self.output_str):
return self.output_str
best_file = self._find_best_output_file(plugin_dir, self.plugin)
if best_file:
return str(best_file.relative_to(snapshot_dir))
# Try to find output file based on plugin's canonical output path
canonical = self.snapshot.canonical_outputs()
plugin_key = f'{self.plugin}_path'
if plugin_key in canonical:
return canonical[plugin_key]
# Fallback to plugin directory
return f'{self.plugin}/'
return None
def create_output_dir(self):
output_dir = Path(self.snapshot_dir) / self.plugin
@@ -2779,7 +2831,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.output_str = 'Hook did not output ArchiveResult record'
# Walk filesystem and populate output_files, output_size, output_mimetypes
exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid', 'cmd.sh'}
mime_sizes = defaultdict(int)
total_size = 0
output_files = {}

View File

@@ -48,6 +48,19 @@ class CustomOutboundWebhookLogFormatter(logging.Formatter):
result = super().format(record)
return result.replace('HTTP Request: ', 'OutboundWebhook: ')
class StripANSIColorCodesFilter(logging.Filter):
_ansi_re = re.compile(r'\x1b\[[0-9;]*m')
_bare_re = re.compile(r'\[[0-9;]*m')
def filter(self, record) -> bool:
msg = record.getMessage()
if isinstance(msg, str) and ('\x1b[' in msg or '[m' in msg):
msg = self._ansi_re.sub('', msg)
msg = self._bare_re.sub('', msg)
record.msg = msg
record.args = ()
return True
ERROR_LOG = tempfile.NamedTemporaryFile().name
@@ -87,6 +100,9 @@ SETTINGS_LOGGING = {
"noisyrequestsfilter": {
"()": NoisyRequestsFilter,
},
"stripansi": {
"()": StripANSIColorCodesFilter,
},
"require_debug_false": {
"()": "django.utils.log.RequireDebugFalse",
},
@@ -101,7 +117,7 @@ SETTINGS_LOGGING = {
"level": "DEBUG",
"markup": False,
"rich_tracebacks": False, # Use standard Python tracebacks (no frame/box)
"filters": ["noisyrequestsfilter"],
"filters": ["noisyrequestsfilter", "stripansi"],
},
"logfile": {
"level": "INFO",
@@ -110,7 +126,7 @@ SETTINGS_LOGGING = {
"maxBytes": 1024 * 1024 * 25, # 25 MB
"backupCount": 10,
"formatter": "rich",
"filters": ["noisyrequestsfilter"],
"filters": ["noisyrequestsfilter", "stripansi"],
},
"outbound_webhooks": {
"class": "rich.logging.RichHandler",

View File

@@ -1,8 +1,10 @@
from django import template
from django.contrib.admin.templatetags.base import InclusionAdminNode
from django.utils.safestring import mark_safe
from django.utils.html import escape
from typing import Union
from pathlib import Path
from archivebox.hooks import (
get_plugin_icon, get_plugin_template, get_plugin_name,
@@ -57,15 +59,18 @@ def plugin_icon(plugin: str) -> str:
Usage: {% plugin_icon "screenshot" %}
"""
return mark_safe(get_plugin_icon(plugin))
icon_html = get_plugin_icon(plugin)
return mark_safe(
f'<span class="abx-plugin-icon" style="display:inline-flex; width:20px; height:20px; align-items:center; justify-content:center;">{icon_html}</span>'
)
@register.simple_tag(takes_context=True)
def plugin_thumbnail(context, result) -> str:
def plugin_card(context, result) -> str:
"""
Render the thumbnail template for an archive result.
Render the card template for an archive result.
Usage: {% plugin_thumbnail result %}
Usage: {% plugin_card result %}
Context variables passed to template:
- result: ArchiveResult object
@@ -74,46 +79,97 @@ def plugin_thumbnail(context, result) -> str:
- plugin: Plugin base name
"""
plugin = get_plugin_name(result.plugin)
template_str = get_plugin_template(plugin, 'thumbnail')
template_str = get_plugin_template(plugin, 'card')
if not template_str:
return ''
# Use embed_path() for the display path
output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
# Use embed_path() for the display path (includes canonical paths)
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
icon_html = get_plugin_icon(plugin)
output_lower = (output_path or '').lower()
text_preview_exts = ('.json', '.jsonl', '.txt', '.csv', '.tsv', '.xml', '.yml', '.yaml', '.md', '.log')
force_text_preview = output_lower.endswith(text_preview_exts)
# Create a mini template and render it with context
try:
tpl = template.Template(template_str)
ctx = template.Context({
'result': result,
'snapshot': result.snapshot,
'output_path': output_path,
'plugin': plugin,
})
rendered = tpl.render(ctx)
# Only return non-empty content (strip whitespace to check)
if rendered.strip():
return mark_safe(rendered)
return ''
if template_str and output_path and str(output_path).strip() not in ('.', '/', './') and not force_text_preview:
tpl = template.Template(template_str)
ctx = template.Context({
'result': result,
'snapshot': result.snapshot,
'output_path': output_path,
'plugin': plugin,
'plugin_icon': icon_html,
})
rendered = tpl.render(ctx)
# Only return non-empty content (strip whitespace to check)
if rendered.strip():
return mark_safe(rendered)
except Exception:
return ''
pass
if force_text_preview and output_path and str(output_path).strip() not in ('.', '/', './'):
output_file = Path(output_path)
if not output_file.is_absolute():
output_file = Path(result.snapshot_dir) / output_path
try:
output_file = output_file.resolve()
snap_dir = Path(result.snapshot_dir).resolve()
if snap_dir not in output_file.parents and output_file != snap_dir:
output_file = None
except Exception:
output_file = None
if output_file and output_file.exists() and output_file.is_file():
try:
with output_file.open('rb') as f:
raw = f.read(4096)
text = raw.decode('utf-8', errors='replace').strip()
if text:
lines = text.splitlines()[:6]
snippet = '\n'.join(lines)
escaped = escape(snippet)
preview = (
f'<div class="thumbnail-text" data-plugin="{plugin}" data-compact="1">'
f'<div class="thumbnail-text-header">'
f'<span class="thumbnail-compact-icon">{icon_html}</span>'
f'<span class="thumbnail-text-title">{plugin}</span>'
f'</div>'
f'<pre class="thumbnail-text-pre">{escaped}</pre>'
f'</div>'
)
return mark_safe(preview)
except Exception:
pass
if output_lower.endswith(text_preview_exts):
fallback_label = 'text'
else:
fallback_label = 'output'
fallback = (
f'<div class="thumbnail-compact" data-plugin="{plugin}" data-compact="1">'
f'<span class="thumbnail-compact-icon">{icon_html}</span>'
f'<span class="thumbnail-compact-label">{plugin}</span>'
f'<span class="thumbnail-compact-meta">{fallback_label}</span>'
f'</div>'
)
return mark_safe(fallback)
@register.simple_tag(takes_context=True)
def plugin_embed(context, result) -> str:
def plugin_full(context, result) -> str:
"""
Render the embed iframe template for an archive result.
Render the full template for an archive result.
Usage: {% plugin_embed result %}
Usage: {% plugin_full result %}
"""
plugin = get_plugin_name(result.plugin)
template_str = get_plugin_template(plugin, 'embed')
template_str = get_plugin_template(plugin, 'full')
if not template_str:
return ''
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
try:
tpl = template.Template(template_str)
@@ -132,36 +188,6 @@ def plugin_embed(context, result) -> str:
return ''
@register.simple_tag(takes_context=True)
def plugin_fullscreen(context, result) -> str:
"""
Render the fullscreen template for an archive result.
Usage: {% plugin_fullscreen result %}
"""
plugin = get_plugin_name(result.plugin)
template_str = get_plugin_template(plugin, 'fullscreen')
if not template_str:
return ''
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
try:
tpl = template.Template(template_str)
ctx = template.Context({
'result': result,
'snapshot': result.snapshot,
'output_path': output_path,
'plugin': plugin,
})
rendered = tpl.render(ctx)
# Only return non-empty content (strip whitespace to check)
if rendered.strip():
return mark_safe(rendered)
return ''
except Exception:
return ''
@register.filter

View File

@@ -8,7 +8,7 @@ from django.views.generic.base import RedirectView
from archivebox.misc.serve_static import serve_static
from archivebox.core.admin_site import archivebox_admin
from archivebox.core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
from archivebox.core.views import HomepageView, SnapshotView, SnapshotPathView, PublicIndexView, AddView, HealthCheckView, live_progress_view
from archivebox.workers.views import JobsDashboardView
@@ -32,6 +32,8 @@ urlpatterns = [
path('archive/', RedirectView.as_view(url='/')),
path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
re_path(r'^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<url>https?://.*)$', SnapshotPathView.as_view(), name='snapshot-path-url'),
re_path(r'^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<domain>[^/]+)(?:/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?)?$', SnapshotPathView.as_view(), name='snapshot-path'),
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
path('add/', AddView.as_view(), name='add'),

View File

@@ -1,7 +1,6 @@
__package__ = 'archivebox.core'
import os
import sys
from django.utils import timezone
import inspect
from typing import Callable, get_type_hints
@@ -26,7 +25,7 @@ import archivebox
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode
from archivebox.misc.serve_static import serve_static_with_byterange_support
from archivebox.misc.logging_util import printable_filesize
from archivebox.search import query_search_index
@@ -52,70 +51,44 @@ class HomepageView(View):
class SnapshotView(View):
# render static html index from filesystem archive/<timestamp>/index.html
@staticmethod
def find_snapshots_for_url(path: str):
    """Return a queryset of Snapshots matching a URL-ish path.

    Lookups go from strictest to fuzziest: exact URL / partial-ID match,
    scheme-agnostic exact URL, exact base URL, and finally base-URL prefix.
    The first non-empty queryset wins.
    """
    candidate = path
    if path.startswith(('http://', 'https://')):
        # Try an exact match on the full URL (or a partial ID match) first.
        matches = Snapshot.objects.filter(Q(url=path) | Q(id__icontains=path))
        if matches.exists():
            return matches
        # Strip the scheme so the remaining lookups are scheme-agnostic.
        candidate = path.split('://', 1)[1]

    # Exact match on the URL under either scheme, or a partial ID match.
    matches = Snapshot.objects.filter(
        Q(url='http://' + candidate)
        | Q(url='https://' + candidate)
        | Q(id__icontains=candidate)
    )
    if matches.exists():
        return matches

    # Fall back to an exact match on the normalized base URL.
    base = base_url(candidate)
    matches = Snapshot.objects.filter(
        Q(url='http://' + base) | Q(url='https://' + base)
    )
    if matches.exists():
        return matches

    # Last resort: any Snapshot whose URL starts with the base URL.
    return Snapshot.objects.filter(
        Q(url__startswith='http://' + base) | Q(url__startswith='https://' + base)
    )
@staticmethod
def render_live_index(request, snapshot):
TITLE_LOADING_MSG = 'Not yet archived...'
# Dict of plugin -> ArchiveResult object
archiveresult_objects = {}
# Dict of plugin -> result info dict (for template compatibility)
archiveresults = {}
results = snapshot.archiveresult_set.all()
for result in results:
embed_path = result.embed_path()
abs_path = result.snapshot_dir / (embed_path or 'None')
if (result.status == 'succeeded'
and embed_path
and os.access(abs_path, os.R_OK)
and abs_path.exists()):
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
continue
# Store the full ArchiveResult object for template tags
archiveresult_objects[result.plugin] = result
result_info = {
'name': result.plugin,
'path': embed_path,
'ts': ts_to_date_str(result.end_ts),
'size': abs_path.stat().st_size or '?',
'result': result, # Include the full object for template tags
}
archiveresults[result.plugin] = result_info
# Use canonical_outputs for intelligent discovery
# This method now scans ArchiveResults and uses smart heuristics
canonical = snapshot.canonical_outputs()
# Add any newly discovered outputs from canonical_outputs to archiveresults
outputs = snapshot.discover_outputs()
archiveresults = {out['name']: out for out in outputs}
snap_dir = Path(snapshot.output_dir)
for key, path in canonical.items():
if not key.endswith('_path') or not path or path.startswith('http'):
continue
plugin_name = key.replace('_path', '')
if plugin_name in archiveresults:
continue # Already have this from ArchiveResult
file_path = snap_dir / path
if not file_path.exists() or not file_path.is_file():
continue
try:
file_size = file_path.stat().st_size
if file_size >= 15_000: # Only show files > 15KB
archiveresults[plugin_name] = {
'name': plugin_name,
'path': path,
'ts': ts_to_date_str(file_path.stat().st_mtime or 0),
'size': file_size,
'result': None,
}
except OSError:
continue
# Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
# Convert to base names for display ordering
@@ -131,7 +104,7 @@ class SnapshotView(View):
preferred_types = tuple(preview_priority + [p for p in all_plugins if p not in preview_priority])
all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
best_result = {'path': 'None', 'result': None}
best_result = {'path': 'about:blank', 'result': None}
for result_type in preferred_types:
if result_type in archiveresults:
best_result = archiveresults[result_type]
@@ -146,7 +119,6 @@ class SnapshotView(View):
context = {
**snapshot_info,
**snapshot_info.get('canonical', {}),
'title': htmlencode(
snapshot.title
or (snapshot.base_url if snapshot.is_archived else TITLE_LOADING_MSG)
@@ -188,6 +160,14 @@ class SnapshotView(View):
try:
try:
snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
canonical_base = snapshot.url_path
if canonical_base != snapshot.legacy_archive_path:
target_path = f'/{canonical_base}/{archivefile or "index.html"}'
query = request.META.get('QUERY_STRING')
if query:
target_path = f'{target_path}?{query}'
return redirect(target_path)
if archivefile == 'index.html':
# if they requested snapshot index, serve live rendered template instead of static html
response = self.render_live_index(request, snapshot)
@@ -221,9 +201,9 @@ class SnapshotView(View):
except Snapshot.MultipleObjectsReturned:
snapshot_hrefs = mark_safe('<br/>').join(
format_html(
'{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
'{} <a href="/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
snap.bookmarked_at.strftime('%Y-%m-%d %H:%M:%S'),
snap.timestamp,
snap.archive_path,
snap.timestamp,
snap.url,
snap.title_stripped[:64] or '',
@@ -259,9 +239,9 @@ class SnapshotView(View):
#'</script>'
'</head><body>'
'<center><br/><br/><br/>'
f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a>: <a href="{snapshot.url}" target="_blank" rel="noreferrer">{snapshot.url}</a><br/>'
f'Snapshot <a href="/{snapshot.archive_path}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a>: <a href="{snapshot.url}" target="_blank" rel="noreferrer">{snapshot.url}</a><br/>'
f'was queued on {str(snapshot.bookmarked_at).split(".")[0]}, '
f'but no files have been saved yet in:<br/><b><a href="/archive/{snapshot.timestamp}/" target="_top"><code>{snapshot.timestamp}</code></a><code>/'
f'but no files have been saved yet in:<br/><b><a href="/{snapshot.archive_path}/" target="_top"><code>{snapshot.timestamp}</code></a><code>/'
'{}'
f'</code></b><br/><br/>'
'It\'s possible {} '
@@ -270,8 +250,8 @@ class SnapshotView(View):
f'<code style="user-select: all; color: #333">archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
'<div class="text-align: left; width: 100%; max-width: 400px">'
'<i><b>Next steps:</i></b><br/>'
f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
f'- view the <a href="/archive/{snapshot.timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
f'- list all the <a href="/{snapshot.archive_path}/" target="_top">Snapshot files <code>.*</code></a><br/>'
f'- view the <a href="/{snapshot.archive_path}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
f'- go to the <a href="/admin/core/snapshot/{snapshot.pk}/change/" target="_top">Snapshot admin</a> to edit<br/>'
f'- go to the <a href="/admin/core/snapshot/?id__exact={snapshot.id}" target="_top">Snapshot actions</a> to re-archive<br/>'
'- or return to <a href="/" target="_top">the main index...</a></div>'
@@ -288,22 +268,9 @@ class SnapshotView(View):
# slug is a URL
try:
try:
# try exact match on full url / ID first
snapshot = Snapshot.objects.get(
Q(url='http://' + path) | Q(url='https://' + path) | Q(id__icontains=path)
)
snapshot = SnapshotView.find_snapshots_for_url(path).get()
except Snapshot.DoesNotExist:
# fall back to match on exact base_url
try:
snapshot = Snapshot.objects.get(
Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path))
)
except Snapshot.DoesNotExist:
# fall back to matching base_url as prefix
snapshot = Snapshot.objects.get(
Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
)
return redirect(f'/archive/{snapshot.timestamp}/index.html')
raise
except Snapshot.DoesNotExist:
return HttpResponse(
format_html(
@@ -322,20 +289,18 @@ class SnapshotView(View):
status=404,
)
except Snapshot.MultipleObjectsReturned:
snapshots = SnapshotView.find_snapshots_for_url(path)
snapshot_hrefs = mark_safe('<br/>').join(
format_html(
'{} <code style="font-size: 0.8em">{}</code> <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
'{} <code style="font-size: 0.8em">{}</code> <a href="/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
snap.bookmarked_at.strftime('%Y-%m-%d %H:%M:%S'),
str(snap.id)[:8],
snap.timestamp,
snap.archive_path,
snap.timestamp,
snap.url,
snap.title_stripped[:64] or '',
)
for snap in Snapshot.objects.filter(
Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
| Q(id__icontains=path)
).only('url', 'timestamp', 'title', 'bookmarked_at').order_by('-bookmarked_at')
for snap in snapshots.only('url', 'timestamp', 'title', 'bookmarked_at').order_by('-bookmarked_at')
)
return HttpResponse(
format_html(
@@ -353,6 +318,108 @@ class SnapshotView(View):
status=404,
)
target_path = f'/{snapshot.archive_path}/index.html'
query = request.META.get('QUERY_STRING')
if query:
target_path = f'{target_path}?{query}'
return redirect(target_path)
class SnapshotPathView(View):
    """Serve snapshots by the new URL scheme: /<username>/<YYYYMMDD>/<domain>/<uuid>/..."""

    def get(self, request, username: str, date: str, domain: str | None = None, snapshot_id: str | None = None, path: str = "", url: str | None = None):
        # Gate access: anonymous users may only view snapshots when the
        # instance is configured to serve them publicly.
        if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')

        # 'system'-owned snapshots are exposed under the '/web/' alias instead.
        if username == 'system':
            return redirect(request.path.replace('/system/', '/web/', 1))

        # The <domain> segment may actually carry a full URL (matched by the
        # snapshot-path-url route); treat it as the requested URL if so.
        requested_url = url
        if not requested_url and domain and domain.startswith(('http://', 'https://')):
            requested_url = domain

        snapshot = None
        if snapshot_id:
            # Exact primary-key lookup first, then fall back to an id-prefix
            # match so shortened (8-char) ids resolve; on ambiguity take the
            # first prefix match.
            try:
                snapshot = Snapshot.objects.get(pk=snapshot_id)
            except Snapshot.DoesNotExist:
                try:
                    snapshot = Snapshot.objects.get(id__startswith=snapshot_id)
                except Snapshot.DoesNotExist:
                    snapshot = None
                except Snapshot.MultipleObjectsReturned:
                    snapshot = Snapshot.objects.filter(id__startswith=snapshot_id).first()
        else:
            # fuzzy lookup by date + domain/url (most recent)
            # NOTE(review): '/web/' maps back to the 'system' user here — the
            # inverse of the redirect above.
            username_lookup = 'system' if username == 'web' else username
            if requested_url:
                qs = SnapshotView.find_snapshots_for_url(requested_url).filter(crawl__created_by__username=username_lookup)
            else:
                qs = Snapshot.objects.filter(crawl__created_by__username=username_lookup)
            # Narrow by the date segment: YYYY, YYYYMM, or YYYYMMDD.
            # Malformed digits are ignored rather than 404ing.
            try:
                if len(date) == 4:
                    qs = qs.filter(created_at__year=int(date))
                elif len(date) == 6:
                    qs = qs.filter(created_at__year=int(date[:4]), created_at__month=int(date[4:6]))
                elif len(date) == 8:
                    qs = qs.filter(
                        created_at__year=int(date[:4]),
                        created_at__month=int(date[4:6]),
                        created_at__day=int(date[6:8]),
                    )
            except ValueError:
                pass
            if requested_url:
                snapshot = qs.order_by('-created_at', '-bookmarked_at', '-timestamp').first()
            else:
                # Normalize the requested domain through the same helper used
                # on stored URLs so the comparison below is apples-to-apples.
                requested_domain = domain or ''
                if requested_domain.startswith(('http://', 'https://')):
                    requested_domain = Snapshot.extract_domain_from_url(requested_domain)
                else:
                    requested_domain = Snapshot.extract_domain_from_url(f'https://{requested_domain}')
                # Prefer exact domain matches
                matches = [s for s in qs.order_by('-created_at', '-bookmarked_at') if Snapshot.extract_domain_from_url(s.url) == requested_domain]
                snapshot = matches[0] if matches else qs.order_by('-created_at', '-bookmarked_at', '-timestamp').first()

        if not snapshot:
            return HttpResponse(
                format_html(
                    (
                        '<center><br/><br/><br/>'
                        'No Snapshots match the given id or url: <code>{}</code><br/><br/><br/>'
                        'Return to the <a href="/" target="_top">Main Index</a>'
                        '</center>'
                    ),
                    snapshot_id or requested_url or domain,
                ),
                content_type="text/html",
                status=404,
            )

        # Redirect any non-canonical spelling of the snapshot path to its
        # canonical form, preserving the sub-path and query string.
        canonical_base = snapshot.url_path
        requested_base = f'{username}/{date}/{domain or url or ""}'
        if snapshot_id:
            requested_base = f'{requested_base}/{snapshot_id}'
        if canonical_base != requested_base:
            target = f'/{canonical_base}/{path or "index.html"}'
            query = request.META.get('QUERY_STRING')
            if query:
                target = f'{target}?{query}'
            return redirect(target)

        # index.html gets the live-rendered template; everything else is
        # served as a static file out of the snapshot's output directory.
        archivefile = path or "index.html"
        if archivefile == "index.html":
            return SnapshotView.render_live_index(request, snapshot)

        return serve_static_with_byterange_support(
            request, archivefile, document_root=snapshot.output_dir, show_indexes=True,
        )
class PublicIndexView(ListView):
template_name = 'public_index.html'
@@ -592,7 +659,7 @@ def live_progress_view(request):
'snapshot_id': str(ar.snapshot_id),
'snapshot_url': ar.snapshot.url[:60] if ar.snapshot else '',
'embed_path': embed,
'archive_path': f'/archive/{ar.snapshot.timestamp}/{embed}' if ar.snapshot else '',
'archive_path': f'/{ar.snapshot.archive_path}/{embed}' if ar.snapshot else '',
'end_ts': ar.end_ts.isoformat() if ar.end_ts else None,
})

View File

@@ -71,8 +71,8 @@ def render_snapshots_list(snapshots_qs, limit=20):
color: {color}; background: {bg};">{status}</span>
</td>
<td style="padding: 6px 8px; white-space: nowrap;">
<a href="/archive/{snapshot.timestamp}/" style="text-decoration: none;">
<img src="/archive/{snapshot.timestamp}/favicon.ico"
<a href="/{snapshot.archive_path}/" style="text-decoration: none;">
<img src="/{snapshot.archive_path}/favicon.ico"
style="width: 16px; height: 16px; vertical-align: middle; margin-right: 4px;"
onerror="this.style.display='none'"/>
</a>

View File

@@ -940,9 +940,8 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
# archivebox/plugins/<plugin_name>/
# templates/
# icon.html # Icon for admin table view (small inline HTML)
# thumbnail.html # Preview thumbnail for snapshot cards
# embed.html # Iframe embed content for main preview
# fullscreen.html # Fullscreen view template
# card.html # Preview card for snapshot header
# full.html # Fullscreen view template
#
# Template context variables available:
# {{ result }} - ArchiveResult object
@@ -953,21 +952,22 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
# Default templates used when plugin doesn't provide one
DEFAULT_TEMPLATES = {
'icon': '''<span title="{{ plugin }}">{{ icon }}</span>''',
'thumbnail': '''
<img src="{{ output_path }}"
alt="{{ plugin }} output"
style="max-width: 100%; max-height: 100px; object-fit: cover;"
onerror="this.style.display='none'">
'icon': '''
<span title="{{ plugin }}" style="display:inline-flex; width:20px; height:20px; align-items:center; justify-content:center;">
{{ icon }}
</span>
''',
'embed': '''
'card': '''
<iframe src="{{ output_path }}"
class="card-img-top"
style="width: 100%; height: 100%; border: none;"
sandbox="allow-same-origin allow-scripts">
sandbox="allow-same-origin allow-scripts allow-forms"
loading="lazy">
</iframe>
''',
'fullscreen': '''
'full': '''
<iframe src="{{ output_path }}"
class="full-page-iframe"
style="width: 100%; height: 100vh; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms">
</iframe>
@@ -981,7 +981,7 @@ def get_plugin_template(plugin: str, template_name: str, fallback: bool = True)
Args:
plugin: Plugin name (e.g., 'screenshot', '15_singlefile')
template_name: One of 'icon', 'thumbnail', 'embed', 'fullscreen'
template_name: One of 'icon', 'card', 'full'
fallback: If True, return default template if plugin template not found
Returns:
@@ -1050,7 +1050,7 @@ def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
Returns:
Dict mapping plugin names to dicts of template_name -> template_path.
e.g., {'screenshot': {'icon': '/path/to/icon.html', 'thumbnail': '/path/to/thumbnail.html'}}
e.g., {'screenshot': {'icon': '/path/to/icon.html', 'card': '/path/to/card.html'}}
"""
templates: Dict[str, Dict[str, str]] = {}
@@ -1068,7 +1068,7 @@ def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
plugin_templates = {}
for template_file in templates_dir.glob('*.html'):
template_name = template_file.stem # icon, thumbnail, embed, fullscreen
template_name = template_file.stem # icon, card, full
plugin_templates[template_name] = str(template_file)
if plugin_templates:

View File

@@ -237,8 +237,8 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
sha256 = models.CharField(max_length=64, default='', null=False, blank=True)
# State machine fields
status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.QUEUED, db_index=True)
retry_at = models.DateTimeField(default=timezone.now, null=True, blank=True, db_index=True,
status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED, max_length=16)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now,
help_text="When to retry this binary installation")
# Health stats
@@ -246,6 +246,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
num_uses_succeeded = models.PositiveIntegerField(default=0)
state_machine_name: str = 'archivebox.machine.models.BinaryMachine'
active_state: str = StatusChoices.QUEUED
objects: BinaryManager = BinaryManager()

View File

@@ -49,6 +49,111 @@ const OUTPUT_DIR = '.';
let chromePid = null;
let browserInstance = null;
function parseCookiesTxt(contents) {
const cookies = [];
let skipped = 0;
for (const rawLine of contents.split(/\r?\n/)) {
const line = rawLine.trim();
if (!line) continue;
let httpOnly = false;
let dataLine = line;
if (dataLine.startsWith('#HttpOnly_')) {
httpOnly = true;
dataLine = dataLine.slice('#HttpOnly_'.length);
} else if (dataLine.startsWith('#')) {
continue;
}
const parts = dataLine.split('\t');
if (parts.length < 7) {
skipped += 1;
continue;
}
const [domainRaw, includeSubdomainsRaw, pathRaw, secureRaw, expiryRaw, name, value] = parts;
if (!name || !domainRaw) {
skipped += 1;
continue;
}
const includeSubdomains = (includeSubdomainsRaw || '').toUpperCase() === 'TRUE';
let domain = domainRaw;
if (includeSubdomains && !domain.startsWith('.')) domain = `.${domain}`;
if (!includeSubdomains && domain.startsWith('.')) domain = domain.slice(1);
const cookie = {
name,
value,
domain,
path: pathRaw || '/',
secure: (secureRaw || '').toUpperCase() === 'TRUE',
httpOnly,
};
const expires = parseInt(expiryRaw, 10);
if (!isNaN(expires) && expires > 0) {
cookie.expires = expires;
}
cookies.push(cookie);
}
return { cookies, skipped };
}
async function importCookiesFromFile(browser, cookiesFile, userDataDir) {
    // Import a Netscape-format cookies.txt into the running browser session
    // via the DevTools protocol. No-ops (with a warning) when the file is
    // unset, missing, unreadable, or contains no cookies.
    if (!cookiesFile) return;

    if (!fs.existsSync(cookiesFile)) {
        console.error(`[!] Cookies file not found: ${cookiesFile}`);
        return;
    }

    let contents = '';
    try {
        contents = fs.readFileSync(cookiesFile, 'utf-8');
    } catch (e) {
        console.error(`[!] Failed to read COOKIES_TXT_FILE: ${e.message}`);
        return;
    }

    const { cookies, skipped } = parseCookiesTxt(contents);
    if (cookies.length === 0) {
        console.error('[!] No cookies found to import');
        return;
    }

    console.error(`[*] Importing ${cookies.length} cookies from ${cookiesFile}...`);
    if (skipped) {
        console.error(`[*] Skipped ${skipped} malformed cookie line(s)`);
    }
    if (!userDataDir) {
        console.error('[!] CHROME_USER_DATA_DIR not set; cookies will not persist beyond this session');
    }

    // Push cookies over CDP in batches so a single bad batch doesn't abort
    // the whole import.
    const page = await browser.newPage();
    const client = await page.target().createCDPSession();
    await client.send('Network.enable');

    const BATCH_SIZE = 200;
    let imported = 0;
    for (let offset = 0; offset < cookies.length; offset += BATCH_SIZE) {
        const batch = cookies.slice(offset, offset + BATCH_SIZE);
        try {
            await client.send('Network.setCookies', { cookies: batch });
            imported += batch.length;
        } catch (e) {
            console.error(`[!] Failed to import cookies ${offset + 1}-${offset + batch.length}: ${e.message}`);
        }
    }

    await page.close();
    console.error(`[+] Imported ${imported}/${cookies.length} cookies`);
}
// Parse command line arguments
function parseArgs() {
const args = {};
@@ -118,10 +223,14 @@ async function main() {
// Load installed extensions
const extensionsDir = getExtensionsDir();
const userDataDir = getEnv('CHROME_USER_DATA_DIR');
const cookiesFile = getEnv('COOKIES_TXT_FILE') || getEnv('COOKIES_FILE');
if (userDataDir) {
console.error(`[*] Using user data dir: ${userDataDir}`);
}
if (cookiesFile) {
console.error(`[*] Using cookies file: ${cookiesFile}`);
}
const installedExtensions = [];
const extensionPaths = [];
@@ -179,6 +288,9 @@ async function main() {
});
browserInstance = browser;
// Import cookies into Chrome profile at crawl start
await importCookiesFromFile(browser, cookiesFile, userDataDir);
// Get actual extension IDs from chrome://extensions page
if (extensionPaths.length > 0) {
await new Promise(r => setTimeout(r, 2000));

View File

@@ -38,6 +38,82 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
CHROME_NAVIGATE_HOOK,
)
def _get_cookies_via_cdp(port: int, env: dict) -> list[dict]:
    """Read all cookies from a running Chrome via the DevTools protocol.

    Spawns a short-lived node script that connects to the first page target
    on ``http://127.0.0.1:<port>`` and issues ``Network.getAllCookies``,
    returning the parsed cookie list. Raises AssertionError (with the
    script's stderr/stdout) if the script exits non-zero.
    """
    # The node script exits with distinct codes per failure mode (2: no
    # target, 3: timeout, 4: empty response, 5: websocket error) and prints
    # the cookie JSON to stdout on success.
    node_script = r"""
const http = require('http');
const WebSocket = require('ws');

const port = process.env.CDP_PORT;

function getTargets() {
    return new Promise((resolve, reject) => {
        const req = http.get(`http://127.0.0.1:${port}/json/list`, (res) => {
            let data = '';
            res.on('data', (chunk) => (data += chunk));
            res.on('end', () => {
                try {
                    resolve(JSON.parse(data));
                } catch (e) {
                    reject(e);
                }
            });
        });
        req.on('error', reject);
    });
}

(async () => {
    const targets = await getTargets();
    const pageTarget = targets.find(t => t.type === 'page') || targets[0];
    if (!pageTarget) {
        console.error('No page target found');
        process.exit(2);
    }
    const ws = new WebSocket(pageTarget.webSocketDebuggerUrl);
    const timer = setTimeout(() => {
        console.error('Timeout waiting for cookies');
        process.exit(3);
    }, 10000);
    ws.on('open', () => {
        ws.send(JSON.stringify({ id: 1, method: 'Network.getAllCookies' }));
    });
    ws.on('message', (data) => {
        const msg = JSON.parse(data);
        if (msg.id === 1) {
            clearTimeout(timer);
            ws.close();
            if (!msg.result || !msg.result.cookies) {
                console.error('No cookies in response');
                process.exit(4);
            }
            process.stdout.write(JSON.stringify(msg.result.cookies));
            process.exit(0);
        }
    });
    ws.on('error', (err) => {
        console.error(String(err));
        process.exit(5);
    });
})().catch((err) => {
    console.error(String(err));
    process.exit(1);
});
"""
    # CDP_PORT is passed via the environment so the script needs no argv
    # parsing; the caller's env is preserved so `node` resolves correctly.
    result = subprocess.run(
        ['node', '-e', node_script],
        capture_output=True,
        text=True,
        timeout=30,
        env=env | {'CDP_PORT': str(port)},
    )
    assert result.returncode == 0, f"Failed to read cookies via CDP: {result.stderr}\nStdout: {result.stdout}"
    # Empty stdout (shouldn't happen on success) is treated as "no cookies".
    return json.loads(result.stdout or '[]')
@pytest.fixture(scope="session", autouse=True)
def ensure_chromium_and_puppeteer_installed(tmp_path_factory):
"""Ensure Chromium and puppeteer are installed before running tests."""
@@ -197,6 +273,77 @@ def test_chrome_launch_and_tab_creation():
pass
def test_cookies_imported_on_launch():
    """Integration test: COOKIES_TXT_FILE is imported at crawl start.

    Writes a minimal Netscape-format cookies.txt, launches the chrome hook
    with COOKIES_TXT_FILE set, then polls the live browser session over CDP
    until the test cookie appears.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()

        # One well-formed cookie line surrounded by comments/blank lines.
        cookies_file = Path(tmpdir) / 'cookies.txt'
        cookies_file.write_text(
            '\n'.join([
                '# Netscape HTTP Cookie File',
                '# https://curl.se/docs/http-cookies.html',
                '# This file was generated by a test',
                '',
                'example.com\tTRUE\t/\tFALSE\t2147483647\tabx_test_cookie\thello',
                '',
            ])
        )

        profile_dir = Path(tmpdir) / 'profile'
        env = get_test_env()
        env.update({
            'CHROME_HEADLESS': 'true',
            'CHROME_USER_DATA_DIR': str(profile_dir),
            'COOKIES_TXT_FILE': str(cookies_file),
        })

        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-cookies'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env
        )

        chrome_pid = None
        try:
            # Wait for the hook to write port.txt (signals Chrome is up).
            for _ in range(15):
                if (chrome_dir / 'port.txt').exists():
                    break
                time.sleep(1)
            assert (chrome_dir / 'port.txt').exists(), "port.txt should exist"

            chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
            port = int((chrome_dir / 'port.txt').read_text().strip())

            # The cookie import runs asynchronously after launch; poll for it.
            cookie_found = False
            for _ in range(15):
                cookies = _get_cookies_via_cdp(port, env)
                cookie_found = any(
                    c.get('name') == 'abx_test_cookie' and c.get('value') == 'hello'
                    for c in cookies
                )
                if cookie_found:
                    break
                time.sleep(1)

            assert cookie_found, "Imported cookie should be present in Chrome session"
        finally:
            # Cleanup always runs, even when an assert above fails, so a
            # failed test doesn't leak a headless Chrome. Exception types are
            # narrowed (no bare `except:`) so KeyboardInterrupt/SystemExit
            # propagate.
            try:
                chrome_launch_process.send_signal(signal.SIGTERM)
                chrome_launch_process.wait(timeout=5)
            except (OSError, subprocess.TimeoutExpired):
                pass
            if chrome_pid is not None:
                try:
                    os.kill(chrome_pid, signal.SIGKILL)
                except OSError:
                    pass
def test_chrome_navigation():
"""Integration test: Navigate to a URL."""
with tempfile.TemporaryDirectory() as tmpdir:

View File

@@ -1,40 +0,0 @@
<!-- Embedded forum view - renders JSONL forum posts -->
<div class="extractor-embed forumdl-embed" style="width: 100%; max-width: 900px; margin: 0 auto; background: #1a1a1a; padding: 20px; border-radius: 8px;">
<div style="text-align: center; padding: 15px 0; border-bottom: 1px solid #333; margin-bottom: 20px;">
<span style="font-size: 32px;">💬</span>
<h3 style="margin: 10px 0; color: #fff; font-size: 18px;">Forum Thread</h3>
</div>
<div id="forum-posts" style="max-height: 500px; overflow-y: auto; color: #ddd;"></div>
<script>
(async function() {
try {
const response = await fetch('{{ output_path }}');
const text = await response.text();
const posts = text.trim().split('\n').map(line => JSON.parse(line));
const container = document.getElementById('forum-posts');
posts.forEach(post => {
const postDiv = document.createElement('div');
postDiv.style.cssText = 'background: #2a2a2a; padding: 15px; margin-bottom: 15px; border-radius: 5px; border-left: 3px solid #4a9eff;';
const author = post.author || 'Anonymous';
const date = post.date ? new Date(post.date).toLocaleString() : '';
const title = post.title || '';
const content = post.content || post.body || '';
postDiv.innerHTML = `
<div style="display: flex; justify-content: space-between; margin-bottom: 10px; padding-bottom: 8px; border-bottom: 1px solid #444;">
<strong style="color: #4a9eff;">${author}</strong>
<span style="color: #888; font-size: 12px;">${date}</span>
</div>
${title ? `<h4 style="margin: 0 0 10px 0; color: #fff;">${title}</h4>` : ''}
<div style="color: #ccc; line-height: 1.5;">${content}</div>
`;
container.appendChild(postDiv);
});
} catch(e) {
document.getElementById('forum-posts').innerHTML = '<p style="color: #888;">Error loading forum posts</p>';
}
})();
</script>
</div>

View File

@@ -1,11 +0,0 @@
<!-- Embedded gallery view - shows first image with link to full gallery -->
<div class="extractor-embed gallerydl-embed" style="width: 100%; max-width: 800px; margin: 0 auto; background: #1a1a1a; padding: 20px;">
<img src="{{ output_path }}"
style="width: 100%; max-height: 600px; object-fit: contain;"
alt="Gallery image"
onerror="this.style.display='none'; this.nextElementSibling.style.display='flex';">
<div style="display: none; flex-direction: column; align-items: center; color: #888; padding: 40px;">
<span style="font-size: 64px;">🖼️</span>
<span style="margin-top: 10px;">Gallery downloaded</span>
</div>
</div>

View File

@@ -1,15 +0,0 @@
<!-- Embedded paper view - shows PDF viewer -->
<div class="extractor-embed papersdl-embed" style="width: 100%; max-width: 900px; margin: 0 auto; background: #1a1a1a; padding: 20px; border-radius: 8px;">
<div style="text-align: center; padding: 15px 0; border-bottom: 1px solid #333; margin-bottom: 20px;">
<span style="font-size: 32px;">📄</span>
<h3 style="margin: 10px 0; color: #fff; font-size: 18px;">Scientific Paper</h3>
</div>
<div style="width: 100%; height: 500px; background: #2a2a2a; border-radius: 5px; overflow: hidden;">
<embed src="{{ output_path }}" type="application/pdf" width="100%" height="100%" />
</div>
<div style="margin-top: 15px; text-align: center;">
<a href="{{ output_path }}" download style="color: #4a9eff; text-decoration: none; padding: 10px 20px; background: #2a2a2a; border-radius: 5px; display: inline-block;">
Download PDF
</a>
</div>
</div>

View File

@@ -271,12 +271,11 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
records.append(record)
print(json.dumps(record))
if records:
URLS_FILE.write_text('\n'.join(json.dumps(r) for r in records) + '\n')
URLS_FILE.write_text('\n'.join(json.dumps(r) for r in records) + ('\n' if records else ''))
# Emit ArchiveResult record to mark completion
status = 'succeeded' if urls_found else 'skipped'
output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'
output_str = URLS_FILE.name
ar_record = {
'type': 'ArchiveResult',
'status': status,

View File

@@ -57,7 +57,7 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
assert 'Found 3 URLs' in result.stderr
assert 'urls.jsonl' in result.stderr
# Parse Snapshot records from stdout
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
@@ -78,6 +78,11 @@ class TestParseHtmlUrls:
assert '"type": "ArchiveResult"' in result.stdout
assert '"status": "succeeded"' in result.stdout
urls_file = tmp_path / 'urls.jsonl'
assert urls_file.exists(), "urls.jsonl not created"
file_lines = [line for line in urls_file.read_text().splitlines() if line.strip()]
assert len(file_lines) == 3, f"Expected 3 urls.jsonl entries, got {len(file_lines)}"
def test_ignores_non_http_schemes(self, tmp_path):
"""Test that non-http schemes are ignored."""
input_file = tmp_path / 'page.html'
@@ -194,7 +199,7 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
assert 'No URLs found' in result.stderr
assert 'urls.jsonl' in result.stderr
assert '"status": "skipped"' in result.stdout
def test_handles_malformed_html(self, tmp_path):

View File

@@ -18,6 +18,7 @@ Supports various field names for URL, title, timestamp, and tags.
import json
import os
import sys
from pathlib import Path
from datetime import datetime
from html import unescape
from urllib.parse import urlparse
@@ -25,6 +26,7 @@ from urllib.parse import urlparse
import rich_click as click
PLUGIN_NAME = 'parse_jsonl_urls'
URLS_FILE = Path('urls.jsonl')
def parse_bookmarked_at(link: dict) -> str | None:
@@ -188,9 +190,12 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
for entry in urls_found:
print(json.dumps(entry))
# Write urls.jsonl to disk for crawl system
URLS_FILE.write_text('\n'.join(json.dumps(r) for r in urls_found) + ('\n' if urls_found else ''))
# Emit ArchiveResult record to mark completion
status = 'succeeded' if urls_found else 'skipped'
output_str = f'Found {len(urls_found)} URLs, {len(all_tags)} tags' if urls_found else 'No URLs found'
output_str = URLS_FILE.name
ar_record = {
'type': 'ArchiveResult',
'status': status,

View File

@@ -32,7 +32,7 @@ class TestParseJsonlUrls:
)
assert result.returncode == 0
assert 'Found 3 URLs' in result.stdout
assert 'urls.jsonl' in result.stderr or 'urls.jsonl' in result.stdout
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
@@ -195,7 +195,7 @@ class TestParseJsonlUrls:
)
assert result.returncode == 0
assert 'No URLs found' in result.stderr
assert 'urls.jsonl' in result.stderr
assert '"status": "skipped"' in result.stdout
def test_exits_1_when_file_not_found(self, tmp_path):

View File

@@ -16,6 +16,7 @@ import json
import os
import re
import sys
from pathlib import Path
from datetime import datetime, timezone
from html import unescape
from urllib.parse import urlparse
@@ -23,6 +24,7 @@ from urllib.parse import urlparse
import rich_click as click
PLUGIN_NAME = 'parse_netscape_urls'
URLS_FILE = Path('urls.jsonl')
# Constants for timestamp epoch detection
UNIX_EPOCH = 0 # 1970-01-01 00:00:00 UTC
@@ -232,9 +234,12 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
for entry in urls_found:
print(json.dumps(entry))
# Write urls.jsonl to disk for crawl system
URLS_FILE.write_text('\n'.join(json.dumps(r) for r in urls_found) + ('\n' if urls_found else ''))
# Emit ArchiveResult record to mark completion
status = 'succeeded' if urls_found else 'skipped'
output_str = f'Found {len(urls_found)} URLs, {len(all_tags)} tags' if urls_found else 'No bookmarks found'
output_str = URLS_FILE.name
ar_record = {
'type': 'ArchiveResult',
'status': status,

View File

@@ -37,7 +37,7 @@ class TestParseNetscapeUrls:
)
assert result.returncode == 0
assert 'Found 3 URLs' in result.stdout
assert 'urls.jsonl' in result.stderr or 'urls.jsonl' in result.stdout
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
@@ -135,7 +135,7 @@ class TestParseNetscapeUrls:
)
assert result.returncode == 0
assert 'No bookmarks found' in result.stderr
assert 'urls.jsonl' in result.stderr
assert '"status": "skipped"' in result.stdout
def test_exits_1_when_file_not_found(self, tmp_path):

View File

@@ -935,7 +935,7 @@ class TestEdgeCases:
)
assert result.returncode == 0
assert 'Found 1000 URLs' in result.stdout
assert 'urls.jsonl' in result.stderr or 'urls.jsonl' in result.stdout
# Output goes to stdout (JSONL) - get all JSONL records
all_lines = [line for line in result.stdout.strip().split('\n') if line.strip() and line.startswith('{')]

View File

@@ -16,6 +16,7 @@ Examples:
import json
import os
import sys
from pathlib import Path
from datetime import datetime, timezone
from html import unescape
from time import mktime
@@ -24,6 +25,7 @@ from urllib.parse import urlparse
import rich_click as click
PLUGIN_NAME = 'parse_rss_urls'
URLS_FILE = Path('urls.jsonl')
try:
import feedparser
@@ -140,9 +142,12 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
for entry in urls_found:
print(json.dumps(entry))
# Write urls.jsonl to disk for crawl system
URLS_FILE.write_text('\n'.join(json.dumps(r) for r in urls_found) + ('\n' if urls_found else ''))
# Emit ArchiveResult record to mark completion
status = 'succeeded' if urls_found else 'skipped'
output_str = f'Found {len(urls_found)} URLs, {len(all_tags)} tags' if urls_found else 'No URLs found'
output_str = URLS_FILE.name
ar_record = {
'type': 'ArchiveResult',
'status': status,

View File

@@ -66,7 +66,7 @@ class TestParseRssUrls:
)
assert result.returncode == 0
assert 'Found 2 URLs' in result.stdout
assert 'urls.jsonl' in result.stderr or 'urls.jsonl' in result.stdout
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
@@ -134,7 +134,7 @@ class TestParseRssUrls:
)
assert result.returncode == 0
assert 'No URLs found' in result.stderr
assert 'urls.jsonl' in result.stderr
assert '"status": "skipped"' in result.stdout
def test_exits_1_when_file_not_found(self, tmp_path):

View File

@@ -882,7 +882,7 @@ class TestEdgeCases:
)
assert result.returncode == 0
assert 'Found 100 URLs' in result.stdout
assert 'urls.jsonl' in result.stderr or 'urls.jsonl' in result.stdout
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]

View File

@@ -26,6 +26,7 @@ from urllib.request import urlopen
import rich_click as click
PLUGIN_NAME = 'parse_txt_urls'
URLS_FILE = Path('urls.jsonl')
# URL regex from archivebox/misc/util.py
# https://mathiasbynens.be/demo/url-regex
@@ -127,6 +128,7 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
urls_found.add(cleaned_url)
# Emit Snapshot records to stdout (JSONL)
records = []
for found_url in sorted(urls_found):
record = {
'type': 'Snapshot',
@@ -138,11 +140,13 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
record['parent_snapshot_id'] = snapshot_id
if crawl_id:
record['crawl_id'] = crawl_id
records.append(record)
print(json.dumps(record))
# Emit ArchiveResult record to mark completion
URLS_FILE.write_text('\n'.join(json.dumps(r) for r in records) + ('\n' if records else ''))
status = 'succeeded' if urls_found else 'skipped'
output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'
output_str = URLS_FILE.name
ar_record = {
'type': 'ArchiveResult',
'status': status,

View File

@@ -32,7 +32,7 @@ https://www.iana.org/domains/reserved
)
assert result.returncode == 0, f"Failed: {result.stderr}"
assert 'Found 3 URLs' in result.stderr
assert 'urls.jsonl' in result.stderr
# Parse Snapshot records from stdout
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
@@ -113,7 +113,7 @@ Also see https://github.com/user/repo for the code.
)
assert result.returncode == 0
assert 'No URLs found' in result.stderr
assert 'urls.jsonl' in result.stderr
assert '"status": "skipped"' in result.stdout
def test_exits_1_when_file_not_found(self, tmp_path):

View File

@@ -1,5 +0,0 @@
<!-- PDF embed - full PDF viewer -->
<embed src="{{ output_path }}#toolbar=1&navpanes=1"
type="application/pdf"
class="extractor-embed pdf-embed"
style="width: 100%; height: 100%; min-height: 500px;">

View File

@@ -31,6 +31,7 @@ PLUGIN_NAME = 'readability'
BIN_NAME = 'readability-extractor'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'content.html'
def get_env(name: str, default: str = '') -> str:
@@ -130,11 +131,11 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
if not text_content and not html_content:
return False, None, 'No content extracted'
(output_dir / 'content.html').write_text(html_content, encoding='utf-8')
(output_dir / OUTPUT_FILE).write_text(html_content, encoding='utf-8')
(output_dir / 'content.txt').write_text(text_content, encoding='utf-8')
(output_dir / 'article.json').write_text(json.dumps(result_json, indent=2), encoding='utf-8')
return True, OUTPUT_DIR, ''
return True, OUTPUT_FILE, ''
except subprocess.TimeoutExpired:
return False, None, f'Timed out after {timeout} seconds'

View File

@@ -0,0 +1,6 @@
<!-- Readability fullscreen - show extracted article HTML -->
<iframe class="full-page-iframe"
src="{{ output_path }}"
name="preview"
sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
</iframe>

View File

@@ -1,5 +0,0 @@
<!-- Screenshot embed - full image view -->
<img src="{{ output_path }}"
alt="Screenshot of page"
class="extractor-embed screenshot-embed"
style="max-width: 100%; height: auto;">

View File

@@ -263,7 +263,7 @@ async function main() {
const archiveResult = {
type: 'ArchiveResult',
status,
output_str: extractedTitle || error || '',
output_str: output || error || '',
};
console.log(JSON.stringify(archiveResult));

View File

@@ -1,9 +0,0 @@
<!-- YT-DLP embed - video/audio player -->
<div class="extractor-embed ytdlp-embed" style="width: 100%; height: 100%; min-height: 400px; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
<video src="{{ output_path }}"
style="max-width: 100%; max-height: 100%;"
controls
preload="metadata">
Your browser does not support the video tag.
</video>
</div>

View File

@@ -6,12 +6,12 @@
</td>
<td class="title-col" style="opacity: {% if link.title %}1{% else %}0.3{% endif %}" title="{{link.title|default:'Not yet archived...'}}">
{% if link.is_archived %}
<a href="/archive/{{link.timestamp}}/index.html"><img src="/archive/{{link.timestamp}}/favicon.ico" onerror="this.style.display='none'" class="link-favicon" decoding="async"></a>
<a href="/{{link.archive_path}}/index.html"><img src="/{{link.archive_path}}/favicon.ico" onerror="this.style.display='none'" class="link-favicon" decoding="async"></a>
{% else %}
<a href="/archive/{{link.timestamp}}/index.html"><img src="{% static 'spinner.gif' %}" onerror="this.style.display='none'" class="link-favicon" decoding="async" style="height: 15px"></a>
<a href="/{{link.archive_path}}/index.html"><img src="{% static 'spinner.gif' %}" onerror="this.style.display='none'" class="link-favicon" decoding="async" style="height: 15px"></a>
{% endif %}
<a href="/archive/{{link.timestamp}}/index.html" title="{{link.title|default:'Not yet archived...'}}">
<a href="/{{link.archive_path}}/index.html" title="{{link.title|default:'Not yet archived...'}}">
<span data-title-for="{{link.url}}" data-archived="{{link.is_archived}}">
{{link.title|default:'Loading...'|truncatechars:128}}
</span>
@@ -29,7 +29,7 @@
{% if link.icons %}
{{link.icons}}&nbsp; <small style="float:right; opacity: 0.5">{{link.num_outputs}}</small>
{% else %}
<a href="/archive/{{link.timestamp}}/index.html">
<a href="/{{link.archive_path}}/index.html">
📄 &nbsp;
{{link.num_outputs}} <img src="{% static 'spinner.gif' %}" onerror="this.style.display='none'" class="files-spinner" decoding="async" style="height: 15px"/>
</a>

View File

@@ -113,6 +113,10 @@
border-radius: 10px;
background-color: black;
overflow: hidden;
min-height: 130px;
}
.header-bottom-frames .card:has([data-compact]) {
min-height: 0;
}
.card h4 {
font-size: 1.4vw;
@@ -154,6 +158,202 @@
transform: scale(0.25);
transform-origin: 0 0;
}
.row.header-bottom-frames {
display: block !important;
width: 100%;
max-width: 100%;
column-width: 180px;
column-gap: 8px;
column-fill: auto;
margin-left: 0px;
margin-right: 0px;
flex: none !important;
}
.header-bottom-frames .col-lg-2 {
padding-left: 0px;
padding-right: 0px;
max-width: 100%;
width: 100% !important;
display: inline-block !important;
float: none !important;
flex: none !important;
break-inside: avoid;
margin-bottom: 6px;
vertical-align: top;
}
.header-bottom-frames .card:has([data-compact]) .thumbnail-wrapper,
.header-bottom-frames .card:has([data-compact]) .thumbnail-wrapper.compact {
height: 32px;
}
.header-bottom-frames .card:has([data-compact]) .thumbnail-text {
height: auto;
max-height: 64px;
}
.header-bottom-frames .card:has([data-compact]) .card-body {
padding: 4px 8px;
max-height: 44px;
}
.thumbnail-wrapper {
height: 100px;
overflow: hidden;
background: #333;
}
.thumbnail-compact {
height: 32px;
display: flex;
align-items: center;
gap: 6px;
padding: 0 8px;
font-size: 13px;
line-height: 1;
color: #bdbdbd;
background: #111;
border-bottom: 1px solid #222;
text-transform: uppercase;
letter-spacing: 0.02em;
}
.thumbnail-compact-label {
color: #e1e1e1;
}
.thumbnail-compact-meta {
color: #777;
font-size: 11px;
margin-left: auto;
}
.thumbnail-compact svg,
.thumbnail-compact img {
height: 12px;
width: 12px;
}
.thumbnail-text {
height: 100px;
background: #121212;
color: #d8d8d8;
padding: 6px 8px;
display: flex;
flex-direction: column;
gap: 4px;
font-family: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
font-size: 11px;
line-height: 1.2;
overflow: hidden;
}
.thumbnail-text-header {
display: flex;
align-items: center;
gap: 6px;
font-size: 10px;
color: #9b9b9b;
text-transform: uppercase;
letter-spacing: 0.04em;
}
.thumbnail-text-pre {
margin: 0;
white-space: pre-wrap;
overflow: hidden;
display: -webkit-box;
-webkit-box-orient: vertical;
-webkit-line-clamp: 5;
}
.thumbnail-text[data-plugin="title"] .thumbnail-text-pre {
font-size: 13px;
font-weight: 600;
-webkit-line-clamp: 3;
}
.thumb-grid {
display: block;
column-width: 180px;
column-gap: 6px;
align-content: start;
width: 100%;
}
.thumb-card {
box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02);
border: 1px solid rgba(0,0,0,3);
border-radius: 10px;
background-color: black;
overflow: hidden;
display: inline-block;
width: 100%;
break-inside: avoid;
box-sizing: border-box;
margin-bottom: 6px;
height: 138px;
min-height: 138px;
max-height: 138px;
}
.thumb-card:has([data-compact]) {
height: 46px;
min-height: 46px;
max-height: 46px;
}
.thumb-card .thumb-body {
font-size: 14px;
padding: 3px 8px;
line-height: 1.2;
word-wrap: break-word;
overflow: hidden;
text-overflow: ellipsis;
background-color: #1a1a1a;
color: #d3d3d3;
}
.thumb-card .thumb-body h4 {
font-size: 1.1em;
margin: 0 0 2px 0;
line-height: 1.1;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.thumb-card .thumbnail-wrapper,
.thumb-card iframe.card-img-top {
display: block;
width: 100%;
}
.thumb-card:has([data-compact]) .thumbnail-wrapper,
.thumb-card:has([data-compact]) .thumbnail-wrapper.compact {
height: 24px;
flex: 0 0 auto;
}
.thumb-card:has([data-compact]) .thumb-body {
padding: 2px 6px;
font-size: 12px;
max-height: 20px;
}
.thumb-card:has([data-compact]) .thumb-body h4 {
font-size: 0.9em;
margin-bottom: 0px;
line-height: 1;
display: flex;
align-items: center;
gap: 4px;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.thumb-card.selected-card {
border: 2px solid orange;
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
}
.thumb-compact .thumbnail-wrapper {
height: 32px;
}
.thumb-compact {
margin-bottom: 0px;
border-radius: 6px;
}
.thumb-compact .card-body {
display: block;
padding: 4px 8px;
font-size: 12px;
line-height: 1.2;
max-height: none;
}
.thumb-compact .thumbnail-compact,
.thumb-compact .thumbnail-text {
height: 32px;
max-height: 32px;
}
.full-page-iframe {
border-top: 1px solid #ddd;
width: 100%;
@@ -203,6 +403,10 @@
box-shadow: 4px 4px 4px rgba(0,0,0,0.2);
margin-top: 0px;
}
.header-bottom.container-fluid {
padding-left: 6px;
padding-right: 6px;
}
.header-bottom-info {
color: #6f6f6f;
padding-top: 0px;
@@ -357,15 +561,15 @@
</div>
</div>
</div>
<div class="row header-bottom-frames">
<div class="thumb-grid">
{% for result_info in archiveresults %}
{% if result_info.result %}
{% plugin_thumbnail result_info.result as thumbnail_html %}
{% if thumbnail_html %}
<div class="col-lg-2">
<div class="card{% if forloop.first %} selected-card{% endif %}">
{{ thumbnail_html }}
<div class="card-body">
{% plugin_card result_info.result as thumbnail_html %}
<div class="thumb-card{% if forloop.first %} selected-card{% endif %}">
<div class="thumbnail-wrapper">
{{ thumbnail_html }}
</div>
<div class="thumb-body">
<a href="{{ result_info.path }}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>{{ result_info.path }}</code></p>
</a>
@@ -373,18 +577,15 @@
<h4 class="card-title">{{ result_info.name|title }}</h4>
</a>
</div>
</div>
</div>
{% endif %}
{% endif %}
{% endfor %}
{% get_config "PREVIEW_ORIGINALS" as preview_originals %}
{% if preview_originals %}
<div class="col-lg-2">
<div class="card">
<div class="thumb-card">
<iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy" referrerpolicy="no-referrer"></iframe>
<div class="card-body">
<div class="thumb-body">
<a href="{{url}}" title="Open in new tab..." target="_blank" rel="noopener" referrerpolicy="no-referrer">
<p class="card-text"><code>🌐 {{domain}}</code></p>
</a>
@@ -392,7 +593,6 @@
<h4 class="card-title">Original</h4>
</a>
</div>
</div>
</div>
{% endif %}
</div>
@@ -417,19 +617,31 @@
}
// show selected file in iframe when preview card is clicked
jQuery('.card').on('click', function(e) {
jQuery('.thumb-card').on('click', function(e) {
jQuery('.selected-card').removeClass('selected-card')
jQuery(e.target).closest('.card').addClass('selected-card')
})
jQuery('.card a[target=preview]').on('click', function(e) {
if (e.currentTarget.href.endsWith('.pdf')) {
jQuery('.full-page-iframe')[0].removeAttribute('sandbox')
} else {
jQuery('.full-page-iframe')[0].sandbox = "allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation"
jQuery(e.target).closest('.thumb-card').addClass('selected-card')
const link = e.target.closest('a[target=preview]') || e.currentTarget.querySelector('a[target=preview]') || e.currentTarget.querySelector('a')
if (!link || !link.href || link.href.endsWith('#')) {
return true
}
window.location.hash = getPreviewTypeFromPath(e.currentTarget)
const iframe = jQuery('.full-page-iframe')[0]
if (!iframe) {
return true
}
if (link.href.endsWith('.pdf')) {
iframe.removeAttribute('sandbox')
} else {
iframe.sandbox = "allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation"
}
window.location.hash = getPreviewTypeFromPath(link)
iframe.src = link.href
return true
})
jQuery('.thumb-card a[target=preview]').on('click', function(e) {
e.preventDefault()
return false
})
function hideSnapshotHeader() {
console.log('Collapsing Snapshot header...')
@@ -483,7 +695,7 @@
for (const link of jQuery('a[target=preview]')) {
console.log(link.pathname)
if (getPreviewTypeFromPath(link) == window.location.hash.slice(1).toLowerCase()) {
jQuery(link).closest('.card').click()
jQuery(link).closest('.thumb-card').click()
jQuery(link).click()
link.click()
}
@@ -502,7 +714,7 @@
// hide all preview iframes on small screens
if (window.innerWidth < 1091) {
jQuery('.card a[target=preview]').attr('target', '_self')
jQuery('.thumb-card a[target=preview]').attr('target', '_self')
}
var pdf_frame = document.querySelector('.pdf-frame');

View File

@@ -130,12 +130,15 @@
.header-bottom-frames .card {
box-shadow: 2px 2px 7px 0px rgba(0, 0, 0, 0.1);
margin-bottom: 5px;
margin-bottom: 6px;
border: 1px solid rgba(0, 0, 0, 0.06);
border-radius: 10px;
background-color: #efefef;
overflow: hidden;
height: 130px;
min-height: 130px;
}
.header-bottom-frames .card:has([data-compact]) {
min-height: 0;
}
.card h4 {
font-size: 0.8em;
@@ -144,7 +147,7 @@
text-transform: uppercase;
margin-top: 0px;
margin-bottom: 5px;
color: rgb(93, 105, 110);
color: #222;
}
.card-body {
font-size: 14px;
@@ -158,7 +161,8 @@
max-height: 102px;
overflow: hidden;
text-overflow: ellipsis;
color: #d3d3d3;
color: #222;
background-color: #f6f6f6;
}
.card-title {
margin-bottom: 4px;
@@ -213,6 +217,10 @@
background-color: #333;
pointer-events: none;
}
.thumbnail-wrapper.compact {
height: 32px;
background-color: #111;
}
.thumbnail-wrapper iframe {
width: 405%;
height: 430px;
@@ -228,10 +236,89 @@
object-fit: cover;
object-position: top center;
}
.thumbnail-compact {
height: 32px;
display: flex;
align-items: center;
gap: 6px;
padding: 0 8px;
font-size: 13px;
line-height: 1;
color: #bdbdbd;
text-transform: uppercase;
letter-spacing: 0.02em;
}
.thumbnail-compact-label {
color: #e1e1e1;
}
.thumbnail-compact-meta {
color: #777;
font-size: 11px;
margin-left: auto;
}
.thumbnail-compact svg,
.thumbnail-compact img {
height: 12px;
width: 12px;
}
.thumbnail-text {
height: 100px;
background: #121212;
color: #d8d8d8;
padding: 6px 8px;
display: flex;
flex-direction: column;
gap: 4px;
font-family: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
font-size: 11px;
line-height: 1.2;
overflow: hidden;
}
.thumbnail-text-header {
display: flex;
align-items: center;
gap: 6px;
font-size: 10px;
color: #9b9b9b;
text-transform: uppercase;
letter-spacing: 0.04em;
}
.thumbnail-text-pre {
margin: 0;
white-space: pre-wrap;
overflow: hidden;
display: -webkit-box;
-webkit-box-orient: vertical;
-webkit-line-clamp: 5;
}
.thumbnail-text[data-plugin="title"] .thumbnail-text-pre {
font-size: 13px;
font-weight: 600;
-webkit-line-clamp: 3;
}
.card.selected-card {
border: 2px solid orange;
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
}
.thumb-compact .card-body {
display: block;
padding: 4px 8px;
font-size: 12px;
line-height: 1.2;
max-height: none;
}
.thumb-compact {
margin-bottom: 0px;
border-radius: 6px;
}
.thumb-compact .thumbnail-wrapper {
height: 32px;
}
.thumb-compact .thumbnail-compact,
.thumb-compact .thumbnail-text {
height: 32px;
max-height: 32px;
}
.iframe-large {
height: calc(100vh - 70px);
}
@@ -256,6 +343,83 @@
object-fit: cover;
object-position: top center;
}
.thumb-grid {
display: block;
column-width: 180px;
column-gap: 6px;
align-content: start;
width: 100%;
}
.thumb-card {
box-shadow: 2px 2px 7px 0px rgba(0, 0, 0, 0.1);
border: 1px solid rgba(0, 0, 0, 0.06);
border-radius: 10px;
background-color: #efefef;
overflow: hidden;
display: inline-block;
width: 100%;
break-inside: avoid;
box-sizing: border-box;
margin-bottom: 6px;
height: 138px;
min-height: 138px;
max-height: 138px;
}
.thumb-card:has([data-compact]) {
height: 46px;
min-height: 46px;
max-height: 46px;
}
.thumb-card .thumb-body {
font-size: 14px;
padding: 3px 8px;
line-height: 1.2;
word-wrap: break-word;
overflow: hidden;
text-overflow: ellipsis;
color: #222;
background-color: #f6f6f6;
}
.thumb-card .thumb-body h4 {
font-size: 0.8em;
text-transform: uppercase;
margin: 0 0 2px 0;
color: #222;
line-height: 1.1;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.thumb-card .thumbnail-wrapper,
.thumb-card iframe.card-img-top {
display: block;
width: 100%;
}
.thumb-card:has([data-compact]) .thumbnail-wrapper,
.thumb-card:has([data-compact]) .thumbnail-wrapper.compact {
height: 24px;
flex: 0 0 auto;
}
.thumb-card:has([data-compact]) .thumb-body {
padding: 2px 6px;
font-size: 12px;
max-height: 20px;
}
.thumb-card:has([data-compact]) .thumb-body h4 {
font-size: 0.9em;
margin-bottom: 0px;
line-height: 1;
display: flex;
align-items: center;
gap: 4px;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.thumb-card.selected-card {
border: 2px solid orange;
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
}
.header-bottom {
border-top: 1px solid rgba(170, 30, 85, 0.9);
padding-bottom: 1px;
@@ -268,6 +432,10 @@
box-shadow: 4px 4px 4px rgba(0,0,0,0.2);
margin-top: 0px;
}
.header-bottom.container-fluid {
padding-left: 6px;
padding-right: 6px;
}
.header-bottom-info {
color: #6f6f6f;
padding-top: 0px;
@@ -315,9 +483,41 @@
width: 100%;
overflow: hidden;
}
.header-bottom-frames {
.row.header-bottom-frames {
padding-top: 5px;
justify-content: center;
display: block !important;
width: 100%;
max-width: 100%;
column-width: 180px;
column-gap: 8px;
column-fill: auto;
margin-left: 0px;
margin-right: 0px;
flex: none !important;
}
.header-bottom-frames .col-lg-2 {
padding-left: 0px;
padding-right: 0px;
max-width: 100%;
width: 100% !important;
display: inline-block !important;
float: none !important;
flex: none !important;
break-inside: avoid;
margin-bottom: 6px;
vertical-align: top;
}
.header-bottom-frames .card:has([data-compact]) .thumbnail-wrapper,
.header-bottom-frames .card:has([data-compact]) .thumbnail-wrapper.compact {
height: 32px;
}
.header-bottom-frames .card:has([data-compact]) .thumbnail-text {
height: auto;
max-height: 64px;
}
.header-bottom-frames .card:has([data-compact]) .card-body {
padding: 4px 8px;
max-height: 44px;
}
.header-bottom-frames .card-title {
width: 100%;
@@ -325,7 +525,7 @@
font-size: 17px;
margin-bottom: 0px;
display: inline-block;
color: #d3d3d3;
color: #222;
font-weight: 200;
vertical-align: 3px;
}
@@ -415,7 +615,7 @@
</small>
</div>
<div class="col-lg-2" style="padding-top: 4px">
<a href="/archive/{{url}}" title="Date Added: {{bookmarked_date}} | First Archived: {{oldest_archive_date|default:downloaded_datestr}} | Last Checked: {{downloaded_datestr}} (UTC)">
<a href="/{{archive_path}}/index.html" title="Date Added: {{bookmarked_date}} | First Archived: {{oldest_archive_date|default:downloaded_datestr}} | Last Checked: {{downloaded_datestr}} (UTC)">
{{oldest_archive_date|default:downloaded_datestr|default:bookmarked_date}}
</a>
<br/>
@@ -431,34 +631,45 @@
</div>
</div>
<div class="header-bottom container-fluid">
<div class="row header-bottom-frames">
<div class="thumb-grid">
{% for result in archiveresults %}
<div class="col-lg-2">
<div class="card {% if forloop.first %}selected-card{% endif %}">
<div class="card-body">
<a href="{{result.path|urlencode}}" target="preview" title="./{{result.path}} (downloaded {{result.ts}})">
<div class="thumb-card{% if forloop.first %} selected-card{% endif %}">
{% with display_path=result.path %}
<div class="thumb-body">
{% if display_path %}
<a href="{{display_path|urlencode}}" target="preview" title="./{{display_path}} (downloaded {{result.ts}})">
<h4>{% plugin_icon result.name %} {{result.name|plugin_name|truncatechars:20}} <small>({{result.size|filesizeformat}})</small></h4>
</a>
{% else %}
<h4>{% plugin_icon result.name %} {{result.name|plugin_name|truncatechars:20}} <small>({{result.size|filesizeformat}})</small></h4>
</a>
{% endif %}
</div>
{% if result.result %}
{# Use plugin-specific thumbnail template when ArchiveResult is available #}
{% if result.result and display_path %}
{# Use plugin-specific card template when ArchiveResult is available #}
<div class="card-img-top thumbnail-wrapper">
{% plugin_thumbnail result.result %}
{% plugin_card result.result %}
</div>
{% else %}
{% elif result.is_metadata and display_path %}
<div class="card-img-top thumbnail-wrapper compact">
<div class="thumbnail-compact" data-plugin="{{result.name}}">
<span class="thumbnail-compact-icon">{% plugin_icon result.name %}</span>
<span class="thumbnail-compact-label">{{result.name|plugin_name}}</span>
<span class="thumbnail-compact-meta">metadata</span>
</div>
</div>
{% elif display_path %}
{# Fall back to generic iframe for filesystem-discovered files #}
<iframe class="card-img-top" src="{{result.path|urlencode}}?autoplay=0" allow="autoplay 'none'; fullscreen 'none'; navigation-override 'none'; " sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
<iframe class="card-img-top" src="{{display_path|urlencode}}?autoplay=0" allow="autoplay 'none'; fullscreen 'none'; navigation-override 'none'; " sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
{% endif %}
</div>
{% endwith %}
</div>
{% endfor %}
<div class="col-lg-2">
<div class="card">
<div class="card-body">
<div class="thumb-card">
<div class="thumb-body">
<a href="./" target="preview">
<h4>Headers, JSON, etc.</h4>
</a>
@@ -466,7 +677,6 @@
</div>
<iframe class="card-img-top" src="./" sandbox="" scrolling="no" loading="lazy"></iframe>
</div>
</div>
</div>
</div>
</header>
@@ -476,11 +686,11 @@
{% if best_result.result %}
{# Use plugin-specific fullscreen template when ArchiveResult is available #}
<div id="main-frame-wrapper" class="full-page-wrapper">
{% plugin_fullscreen best_result.result %}
{% plugin_full best_result.result %}
</div>
{% else %}
{# Fall back to generic iframe #}
<iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_result.path|urlencode}}" name="preview"></iframe>
<iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_result.path|default:'about:blank'|urlencode}}" name="preview"></iframe>
{% endif %}
@@ -513,21 +723,45 @@
return link.getAttribute('href')
}
const iframe_elem = document.getElementById('main-frame')
function ensureMainFrame() {
let frame = document.getElementById('main-frame')
if (!frame) {
const wrapper = document.getElementById('main-frame-wrapper')
frame = document.createElement('iframe')
frame.id = 'main-frame'
frame.name = 'preview'
frame.className = 'full-page-iframe'
frame.sandbox = "allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms"
if (wrapper) {
wrapper.innerHTML = ''
wrapper.appendChild(frame)
wrapper.classList.remove('full-page-wrapper')
}
}
return frame
}
for (const card of [...document.querySelectorAll('.card')]) {
for (const card of [...document.querySelectorAll('.thumb-card')]) {
card.addEventListener('click', function(event) {
const target = event.currentTarget.querySelector('a').href
const link = event.target.closest('a[target=preview]') || event.currentTarget.querySelector('a[target=preview]') || event.currentTarget.querySelector('a')
if (!link) {
return
}
const target = link.href
if (!target || target.endsWith('#')) {
return
}
jQuery('.selected-card').removeClass('selected-card')
jQuery(event.currentTarget).closest('.card').addClass('selected-card')
jQuery(event.currentTarget).closest('.thumb-card').addClass('selected-card')
const iframe_elem = ensureMainFrame()
if (target.endsWith('.pdf')) {
jQuery('#main-frame')[0].removeAttribute('sandbox')
iframe_elem.removeAttribute('sandbox')
} else {
jQuery('#main-frame')[0].sandbox = "allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms"
iframe_elem.sandbox = "allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms"
}
window.location.hash = getPreviewTypeFromPath(event.currentTarget.querySelector('a'))
window.location.hash = getPreviewTypeFromPath(link)
iframe_elem.src = target
})
@@ -587,7 +821,7 @@
for (const link of jQuery('a[target=preview]')) {
console.log(link.pathname)
if (getPreviewTypeFromPath(link) == window.location.hash.slice(1).toLowerCase()) {
jQuery(link).closest('.card').click()
jQuery(link).closest('.thumb-card').click()
jQuery(link).click()
link.click()
}

View File

@@ -698,7 +698,7 @@ class SnapshotWorker(Worker):
try:
# Get merged config (includes env vars passed via Process.env, snapshot.config, defaults, etc.)
config = get_config(snapshot=self.snapshot)
config = get_config(snapshot=self.snapshot, crawl=self.snapshot.crawl)
# Discover all hooks for this snapshot
hooks = discover_hooks('Snapshot', config=config)
@@ -842,14 +842,13 @@ class SnapshotWorker(Worker):
# Clear to avoid double-termination during on_shutdown
self.background_processes = {}
# Update STARTED background results now that hooks are done
# Update background results now that hooks are done
from archivebox.core.models import ArchiveResult
started_bg = self.snapshot.archiveresult_set.filter(
status=ArchiveResult.StatusChoices.STARTED,
bg_results = self.snapshot.archiveresult_set.filter(
hook_name__contains='.bg.',
)
for ar in started_bg:
for ar in bg_results:
ar.update_from_output()
def _reap_background_hooks(self) -> None:
@@ -867,7 +866,7 @@ class SnapshotWorker(Worker):
continue
ar = self.snapshot.archiveresult_set.filter(hook_name=hook_name).first()
if ar and ar.status == ArchiveResult.StatusChoices.STARTED:
if ar:
ar.update_from_output()
# Remove completed hook from tracking