This commit is contained in:
Nick Sweeting
2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions

View File

@@ -29,7 +29,8 @@
</center>
{% else %}
<div id="in-progress" style="display: none;">
<center><h3>Adding URLs to index and running archive methods...</h3>
<center><h3>Creating crawl and queueing snapshots...</h3>
<p>Your crawl is being created. The orchestrator will process URLs and create snapshots in the background.</p>
<br/>
<div class="loader"></div>
<br/>
@@ -37,16 +38,230 @@
</center>
</div>
<form id="add-form" method="POST" class="p-form">{% csrf_token %}
<h1>Add new URLs to your archive</h1>
<h1>Create a new Crawl</h1>
<div class="crawl-explanation">
<p>
A <strong>Crawl</strong> is a job that processes URLs and creates <strong>Snapshots</strong> (archived copies) for each URL discovered.
The settings below apply to the entire crawl and all snapshots it creates.
</p>
</div>
<br/>
{{ form.as_p }}
<!-- Basic fields -->
<div class="form-section">
<h3>Crawl Settings</h3>
<div class="form-field">
{{ form.url.label_tag }}
{{ form.url }}
<div id="url-counter" class="url-counter">0 URLs detected</div>
{% if form.url.errors %}
<div class="error">{{ form.url.errors }}</div>
{% endif %}
<div class="help-text">
Enter URLs to archive, one per line. Examples:<br/>
<code>https://example.com</code><br/>
<code>https://news.ycombinator.com</code><br/>
<code>https://github.com/ArchiveBox/ArchiveBox</code>
</div>
</div>
<div class="form-field">
{{ form.tag.label_tag }}
{{ form.tag }}
<!-- Tag autocomplete datalist -->
<datalist id="tag-datalist">
{% for tag_name in available_tags %}
<option value="{{ tag_name }}">
{% endfor %}
</datalist>
{% if form.tag.errors %}
<div class="error">{{ form.tag.errors }}</div>
{% endif %}
<div class="help-text">Tags will be applied to all snapshots created by this crawl. Start typing to see existing tags.</div>
</div>
<div class="form-field">
{{ form.depth.label_tag }}
{{ form.depth }}
{% if form.depth.errors %}
<div class="error">{{ form.depth.errors }}</div>
{% endif %}
<div class="help-text">Controls how many links deep the crawl will follow from the starting URLs.</div>
</div>
<div class="form-field">
{{ form.notes.label_tag }}
{{ form.notes }}
{% if form.notes.errors %}
<div class="error">{{ form.notes.errors }}</div>
{% endif %}
<div class="help-text">Optional description for this crawl (visible in the admin interface).</div>
</div>
</div>
<!-- Plugins section -->
<div class="form-section">
<h3>Crawl Plugins</h3>
<!-- Intro text for the plugin picker; the details link opens in a new tab, so it carries rel="noopener" like the other target="_blank" links. -->
<p class="section-description">
Select which archiving methods to run for all snapshots in this crawl. If none selected, all available plugins will be used.
<a href="/admin/environment/plugins/" target="_blank" rel="noopener">View plugin details →</a>
</p>
<!-- Plugin Presets -->
<div class="plugin-presets">
<span class="preset-label">Quick Select:</span>
<button type="button" class="preset-btn" data-preset="quick-archive">📦 Quick Archive</button>
<button type="button" class="preset-btn" data-preset="full-chrome">🌐 Full Chrome</button>
<button type="button" class="preset-btn" data-preset="text-only">📄 Text Only</button>
<button type="button" class="preset-btn" data-preset="select-all">✓ Select All</button>
<button type="button" class="preset-btn" data-preset="clear-all">✗ Clear All</button>
</div>
<!-- Chrome-dependent plugins with "Select All" -->
<div class="plugin-group">
<div class="plugin-group-header">
<label>Chrome-dependent plugins</label>
<button type="button" class="select-all-btn" data-group="chrome">
Select All Chrome
</button>
</div>
<div class="plugin-checkboxes" id="chrome-plugins">
{{ form.chrome_plugins }}
</div>
</div>
<!-- Archiving plugins -->
<div class="plugin-group">
<div class="plugin-group-header">
<label>Archiving</label>
</div>
<div class="plugin-checkboxes">
{{ form.archiving_plugins }}
</div>
</div>
<!-- Parsing plugins -->
<div class="plugin-group">
<div class="plugin-group-header">
<label>Parsing</label>
</div>
<div class="plugin-checkboxes">
{{ form.parsing_plugins }}
</div>
</div>
<!-- Search plugins -->
<div class="plugin-group">
<div class="plugin-group-header">
<label>Search</label>
</div>
<div class="plugin-checkboxes">
{{ form.search_plugins }}
</div>
</div>
<!-- Binary provider plugins -->
<div class="plugin-group">
<div class="plugin-group-header">
<label>Binary Providers</label>
</div>
<div class="plugin-checkboxes">
{{ form.binary_plugins }}
</div>
</div>
<!-- Extension plugins -->
<div class="plugin-group">
<div class="plugin-group-header">
<label>Browser Extensions</label>
</div>
<div class="plugin-checkboxes">
{{ form.extension_plugins }}
</div>
</div>
</div>
<!-- Advanced options (collapsible) -->
<div class="form-section">
<details class="advanced-section">
<summary><h3>Advanced Crawl Options</h3></summary>
<p class="section-description">Additional settings that control how this crawl processes URLs and creates snapshots.</p>
<div class="form-field">
{{ form.schedule.label_tag }}
{{ form.schedule }}
{% if form.schedule.errors %}
<div class="error">{{ form.schedule.errors }}</div>
{% endif %}
<div class="help-text">
Optional: Schedule this crawl to repeat automatically. Examples:<br/>
<code>daily</code> - Run once per day<br/>
<code>weekly</code> - Run once per week<br/>
<code>0 */6 * * *</code> - Every 6 hours (cron format)<br/>
<code>0 0 * * 0</code> - Every Sunday at midnight (cron format)
</div>
</div>
<div class="form-field">
{{ form.persona.label_tag }}
{{ form.persona }}
{% if form.persona.errors %}
<div class="error">{{ form.persona.errors }}</div>
{% endif %}
<!-- Help text for the persona field; the "create" link opens in a new tab, so it carries rel="noopener". -->
<div class="help-text">
Authentication profile to use for all snapshots in this crawl.
<a href="/admin/personas/persona/add/" target="_blank" rel="noopener">Create new persona →</a>
</div>
</div>
<div class="form-field checkbox-field">
{{ form.overwrite }}
{{ form.overwrite.label_tag }}
{% if form.overwrite.errors %}
<div class="error">{{ form.overwrite.errors }}</div>
{% endif %}
<div class="help-text">Re-archive URLs even if they already exist</div>
</div>
<div class="form-field checkbox-field">
{{ form.update }}
{{ form.update.label_tag }}
{% if form.update.errors %}
<div class="error">{{ form.update.errors }}</div>
{% endif %}
<div class="help-text">Retry archiving URLs that previously failed</div>
</div>
<div class="form-field checkbox-field">
{{ form.index_only }}
{{ form.index_only.label_tag }}
{% if form.index_only.errors %}
<div class="error">{{ form.index_only.errors }}</div>
{% endif %}
<div class="help-text">Create snapshots but don't run archiving plugins yet (queue for later)</div>
</div>
<div class="form-field">
{{ form.config.label_tag }}
{{ form.config }}
{% if form.config.errors %}
<div class="error">{{ form.config.errors }}</div>
{% endif %}
<div class="help-text">
Override any config option for this crawl (e.g., TIMEOUT, USER_AGENT, CHROME_BINARY, etc.)
</div>
</div>
</details>
</div>
<center>
<button role="submit" id="submit">&nbsp; Add URLs and archive </button>
<!-- "submit" is not an ARIA role; the button's submit behavior is declared with the type attribute instead. -->
<button type="submit" id="submit">&nbsp; Create Crawl and Start Archiving </button>
</center>
</form>
<br/><br/><br/>
<center id="delay-warning" style="display: none">
<small>(you will be redirected to your <a href="/">Snapshot list</a> momentarily, its safe to close this page at any time)</small>
<small>(you will be redirected to your new Crawl page momentarily, it's safe to close this page at any time)</small>
</center>
{% if absolute_add_path %}
<!-- <center id="bookmarklet">
@@ -55,6 +270,109 @@
</center> -->
{% endif %}
<script>
// URL Counter - detect URLs in textarea using regex
const urlTextarea = document.querySelector('textarea[name="url"]');
const urlCounter = document.getElementById('url-counter');

/**
 * Count the http(s):// URLs currently typed into the URL textarea and
 * refresh the "#url-counter" label (text and CSS class) to match.
 *
 * Does nothing when either element was not found, so a failed lookup
 * cannot throw and abort the rest of this inline script.
 */
function updateURLCount() {
    if (!urlTextarea || !urlCounter) return;

    // A URL here is "http://" or "https://" followed by a run of non-whitespace characters.
    const urlRegex = /https?:\/\/[^\s]+/gi;
    const matches = urlTextarea.value.match(urlRegex) || [];
    const count = matches.length;

    // Singular label only for exactly one URL ("0 URLs", "1 URL", "2 URLs").
    urlCounter.textContent = `${count} URL${count !== 1 ? 's' : ''} detected`;
    urlCounter.className = count > 0 ? 'url-counter url-counter-positive' : 'url-counter';
}

// Recount on every keystroke/paste; skipped when the textarea is absent.
if (urlTextarea) {
    urlTextarea.addEventListener('input', updateURLCount);
}
updateURLCount(); // Initial count
// Plugin Presets
// Maps a preset button's data-preset value to the checkbox values that preset ticks.
// The "select-all" and "clear-all" presets are not listed here; the click handler special-cases them.
const presetConfigs = {
    'quick-archive': [
        'screenshot',
        'dom',
        'favicon',
        'wget',
        'title',
    ],
    'full-chrome': [
        'chrome',
        'screenshot',
        'pdf',
        'dom',
        'singlefile',
        'consolelog',
        'redirects',
        'responses',
        'ssl',
        'headers',
        'title',
        'accessibility',
        'seo',
    ],
    'text-only': [
        'wget',
        'readability',
        'mercury',
        'htmltotext',
        'title',
        'favicon',
    ],
};
// Wire each "Quick Select" button: its data-preset attribute picks which plugin checkboxes get ticked.
document.querySelectorAll('.preset-btn').forEach((presetButton) => {
    presetButton.addEventListener('click', () => {
        const presetName = presetButton.dataset.preset;
        const pluginBoxes = document.querySelectorAll('.plugin-checkboxes input[type="checkbox"]');

        // Pick a per-checkbox rule; stays null for an unknown preset so the boxes are left untouched.
        let shouldCheck = null;
        if (presetName === 'select-all') {
            shouldCheck = () => true;
        } else if (presetName === 'clear-all') {
            shouldCheck = () => false;
        } else if (presetConfigs[presetName]) {
            const wantedPlugins = presetConfigs[presetName];
            shouldCheck = (box) => wantedPlugins.includes(box.value);
        }

        if (shouldCheck) {
            pluginBoxes.forEach((box) => {
                box.checked = shouldCheck(box);
            });
        }

        // Save to localStorage after preset selection
        saveFormState();
    });
});
// Select All Chrome button handler
// Each .select-all-btn toggles every checkbox inside the element whose id is "<data-group>-plugins".
document.querySelectorAll('.select-all-btn').forEach((toggleButton) => {
    toggleButton.addEventListener('click', () => {
        const groupContainer = document.getElementById(`${toggleButton.dataset.group}-plugins`);
        const groupBoxes = Array.from(groupContainer.querySelectorAll('input[type="checkbox"]'));

        // If everything was already ticked, this click unticks the group; otherwise it ticks everything.
        const wereAllChecked = groupBoxes.every((box) => box.checked);
        const nextChecked = !wereAllChecked;
        for (const box of groupBoxes) {
            box.checked = nextChecked;
        }

        // The label describes what the NEXT click will do.
        toggleButton.textContent = nextChecked ? 'Deselect All Chrome' : 'Select All Chrome';
        saveFormState();
    });
});
// LocalStorage: Save/Load form state (all fields including URLs for repeat crawls)
const STORAGE_KEY = 'archivebox_add_form_state';

/**
 * Snapshot every named control inside #add-form into localStorage under STORAGE_KEY.
 *
 * Storage format (a flat JSON object):
 *   - checkboxes / radios: key "<name>:<value>" -> boolean checked state
 *   - everything else:     key "<name>"         -> the control's current value
 *
 * The CSRF token is never stored, and controls without a name are skipped
 * because they would produce meaningless keys. Saving is best-effort:
 * localStorage may be unavailable or full, in which case the failure is
 * logged and the form keeps working.
 */
function saveFormState() {
    const state = {};
    document.querySelectorAll('#add-form input, #add-form textarea, #add-form select').forEach(el => {
        if (!el.name || el.name === 'csrfmiddlewaretoken') return;
        if (el.type === 'checkbox' || el.type === 'radio') {
            state[el.name + ':' + el.value] = el.checked;
        } else {
            state[el.name] = el.value;
        }
    });
    try {
        // Both the localStorage getter and setItem() can throw (storage disabled, quota exceeded).
        localStorage.setItem(STORAGE_KEY, JSON.stringify(state));
    } catch (err) {
        console.warn('Could not save add-form state to localStorage:', err);
    }
}
/**
 * Restore the controls inside #add-form from the JSON object stored in
 * localStorage under STORAGE_KEY, then refresh the URL counter.
 *
 * Expected storage format:
 *   - checkboxes / radios: key "<name>:<value>" -> boolean checked state
 *   - everything else:     key "<name>"         -> value string
 *
 * The form's controls are walked and each one's key is looked up in the
 * saved state, rather than parsing saved keys back into CSS selectors.
 * That keeps names and values containing ":" or quote characters working,
 * and confines the restore to #add-form. Restoring is best-effort:
 * any failure is logged and the form is left as rendered.
 */
function loadFormState() {
    try {
        const state = JSON.parse(localStorage.getItem(STORAGE_KEY) || '{}');
        // Guard against a stored "null" or a non-object payload.
        if (!state || typeof state !== 'object') return;

        document.querySelectorAll('#add-form input, #add-form textarea, #add-form select').forEach(el => {
            // File inputs reject programmatic values; the CSRF token must stay server-issued.
            if (!el.name || el.name === 'csrfmiddlewaretoken' || el.type === 'file') return;

            const isToggle = el.type === 'checkbox' || el.type === 'radio';
            const key = isToggle ? el.name + ':' + el.value : el.name;
            if (!Object.prototype.hasOwnProperty.call(state, key)) return;

            if (isToggle) {
                el.checked = Boolean(state[key]);
            } else {
                el.value = state[key];
            }
        });

        updateURLCount(); // Update counter after loading URLs
    } catch (err) {
        console.warn('Could not restore add-form state from localStorage:', err);
    }
}
// Auto-save on changes
// Every input/textarea/select inside #add-form persists the whole form when its "change" event fires.
Array.from(
    document.querySelectorAll('#add-form input, #add-form textarea, #add-form select')
).forEach(function (control) {
    control.addEventListener('change', saveFormState);
});

// Restore whatever was saved on a previous visit once the listeners are attached.
loadFormState();
// Form submission handler
document.getElementById('add-form').addEventListener('submit', function(event) {
document.getElementById('in-progress').style.display = 'block'
document.getElementById('add-form').style.display = 'none'

View File

@@ -1,4 +1,4 @@
{% load tz core_tags %}
{% load tz core_tags config_tags %}
<!DOCTYPE html>
<html lang="en">
@@ -358,64 +358,26 @@
</div>
</div>
<div class="row header-bottom-frames">
<div class="col-lg-2">
<div class="card selected-card">
<iframe class="card-img-top" src="{{singlefile_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
<div class="card-body">
<a href="{{singlefile_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./singlefile.html</code></p>
</a>
<a href="{{singlefile_path}}" target="preview"><h4 class="card-title">Chrome &gt; SingleFile</h4></a>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top pdf-frame" src="{{pdf_path}}#toolbar=0" scrolling="no" loading="lazy"></iframe>
<div class="card-body">
<a href="{{pdf_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./output.pdf</code></p>
</a>
<a href="{{pdf_path}}" target="preview" id="pdf-btn"><h4 class="card-title">Chrome &gt; PDF</h4></a>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<img class="card-img-top" src="{{screenshot_path}}" onerror="this.style.opacity=0.2"/>
<div class="card-body">
<a href="{{screenshot_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./screenshot.png</code></p>
</a>
<a href="{{screenshot_path}}" target="preview"><h4 class="card-title">Chrome &gt; Screenshot</h4></a>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{archive_url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
<div class="card-body">
<a href="{{archive_url}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./{{domain}}</code></p>
</a>
<a href="{{archive_url}}" target="preview"><h4 class="card-title">Wget &gt; HTML</h4></a>
{% for result_info in archiveresults %}
{% if result_info.result %}
<div class="col-lg-2">
<div class="card{% if forloop.first %} selected-card{% endif %}">
{% plugin_thumbnail result_info.result %}
<div class="card-body">
<a href="{{ result_info.path }}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>{{ result_info.path }}</code></p>
</a>
<a href="{{ result_info.path }}" target="preview">
<h4 class="card-title">{{ result_info.name|title }}</h4>
</a>
</div>
</div>
</div>
</div>
</div>
{% if SAVE_ARCHIVE_DOT_ORG %}
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{archive_org_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
<div class="card-body">
<a href="{{archive_org_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>🌐 web.archive.org/web/...</code></p>
</a>
<a href="{{archive_org_path}}" target="preview" id="archive_dot_org-btn"><h4 class="card-title">Archive.Org</h4></a>
</div>
</div>
</div>
{% endif %}
{% if PREVIEW_ORIGINALS %}
{% endif %}
{% endfor %}
{% get_config "PREVIEW_ORIGINALS" as preview_originals %}
{% if preview_originals %}
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy" referrerpolicy="no-referrer"></iframe>
@@ -426,77 +388,10 @@
<a href="{{url}}" target="preview" id="original-btn" referrerpolicy="no-referrer">
<h4 class="card-title">Original</h4>
</a>
</div>
</div>
</div>
</div>
{% endif %}
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
<div class="card-body">
<a href="{{headers_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./headers.json</code></p>
</a>
<a href="{{headers_path}}" target="preview"><h4 class="card-title">Headers</h4></a>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{dom_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
<div class="card-body">
<a href="{{dom_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./output.html</code></p>
</a>
<a href="{{dom_path}}" target="preview"><h4 class="card-title">Chrome &gt; HTML</h4></a>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{readability_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
<div class="card-body">
<a href="{{readability_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./readability/content.html</code></p>
</a>
<a href="{{readability_path}}" target="preview"><h4 class="card-title">Readability</h4></a>
</div>
</div>
</div>
<br/>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{mercury_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
<div class="card-body">
<a href="{{mercury_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./mercury/content.html</code></p>
</a>
<a href="{{mercury_path}}" target="preview"><h4 class="card-title">Mercury</h4></a>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{media_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
<div class="card-body">
<a href="{{media_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./media/*.mp4</code></p>
</a>
<a href="{{media_path}}" target="preview"><h4 class="card-title">Media</h4></a>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{git_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
<div class="card-body">
<a href="{{git_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./git/*.git</code></p>
</a>
<a href="{{git_path}}" target="preview"><h4 class="card-title">Git</h4></a>
</div>
</div>
</div>
</div>
</div>
</header>