Files
ArchiveBox/archivebox/core/forms.py
2025-12-30 16:12:53 -08:00

239 lines
7.3 KiB
Python

__package__ = 'archivebox.core'
from django import forms
from archivebox.misc.util import URL_REGEX
from taggit.utils import edit_string_for_tags, parse_tags
from archivebox.base_models.admin import KeyValueWidget
# (value, label) pairs offered by AddLinkForm's "Archive depth" radio selector.
# The value is the depth number as a string; the label spells out how many
# link hops away from the submitted URLs that depth reaches.
DEPTH_CHOICES = tuple(
    (str(depth), f'depth = {depth} ({reach})')
    for depth, reach in enumerate((
        'archive just these URLs',
        '+ URLs one hop away',
        '+ URLs two hops away',
        '+ URLs three hops away',
        '+ URLs four hops away',
    ))
)
from archivebox.hooks import get_plugins
def get_plugin_choices():
    """Build form ``choices`` pairs from the plugins reported by ``get_plugins()``.

    Returns:
        A list of ``(name, name)`` tuples, one per plugin, in the order
        ``get_plugins()`` yields them. The plugin name is used as both the
        submitted value and the displayed label.
    """
    discovered = get_plugins()
    return [(plugin_name, plugin_name) for plugin_name in discovered]
class AddLinkForm(forms.Form):
    """Form for submitting URLs to archive, plus the options for that crawl.

    Besides the basic URL/tag/depth/notes inputs, the form exposes one
    multi-select per plugin group. The choices for those groups are filled in
    at construction time from ``get_plugins()``, and ``clean()`` merges the
    per-group selections into a single ``cleaned_data['plugins']`` list.
    """

    # Single source of truth for the plugin groups: maps each plugin-group
    # form field to the plugin names that belong to it. Both __init__ (to
    # populate choices) and clean() (to merge selections) iterate this mapping,
    # so the two can never drift apart. Insertion order is significant: it is
    # the order in which clean() concatenates the selected plugins.
    _PLUGIN_GROUPS = {
        'chrome_plugins': frozenset({
            'accessibility', 'chrome', 'consolelog', 'dom', 'headers',
            'parse_dom_outlinks', 'pdf', 'redirects', 'responses',
            'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title',
        }),
        'archiving_plugins': frozenset({
            'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
            'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget',
        }),
        'parsing_plugins': frozenset({
            'parse_html_urls', 'parse_jsonl_urls',
            'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls',
        }),
        'search_plugins': frozenset({
            'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite',
        }),
        'binary_plugins': frozenset({'apt', 'brew', 'custom', 'env', 'npm', 'pip'}),
        'extension_plugins': frozenset({'twocaptcha', 'istilldontcareaboutcookies', 'ublock'}),
    }

    # Basic fields
    url = forms.RegexField(
        label="URLs (one per line)",
        regex=URL_REGEX,
        min_length=6,  # was the string '6'; Django documents this as an integer
        strip=True,
        widget=forms.Textarea,
        required=True,
    )
    tag = forms.CharField(
        label="Tags (comma separated tag1,tag2,tag3)",
        strip=True,
        required=False,
        widget=forms.TextInput(attrs={
            'list': 'tag-datalist',
            'autocomplete': 'off',
        }),
    )
    depth = forms.ChoiceField(
        label="Archive depth",
        choices=DEPTH_CHOICES,
        initial='0',
        widget=forms.RadioSelect(attrs={"class": "depth-selection"}),
    )
    notes = forms.CharField(
        label="Notes",
        strip=True,
        required=False,
        widget=forms.Textarea(attrs={
            'rows': 3,
            'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
        }),
    )

    # Plugin groups (choices are populated per-instance in __init__)
    chrome_plugins = forms.MultipleChoiceField(
        label="Chrome-dependent plugins",
        required=False,
        widget=forms.CheckboxSelectMultiple,
        choices=[],
    )
    archiving_plugins = forms.MultipleChoiceField(
        label="Archiving",
        required=False,
        widget=forms.CheckboxSelectMultiple,
        choices=[],
    )
    parsing_plugins = forms.MultipleChoiceField(
        label="Parsing",
        required=False,
        widget=forms.CheckboxSelectMultiple,
        choices=[],
    )
    search_plugins = forms.MultipleChoiceField(
        label="Search",
        required=False,
        widget=forms.CheckboxSelectMultiple,
        choices=[],
    )
    binary_plugins = forms.MultipleChoiceField(
        label="Binary providers",
        required=False,
        widget=forms.CheckboxSelectMultiple,
        choices=[],
    )
    extension_plugins = forms.MultipleChoiceField(
        label="Browser extensions",
        required=False,
        widget=forms.CheckboxSelectMultiple,
        choices=[],
    )

    # Advanced options
    schedule = forms.CharField(
        label="Repeat schedule",
        max_length=64,
        required=False,
        widget=forms.TextInput(attrs={
            'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
        }),
    )
    persona = forms.CharField(
        label="Persona (authentication profile)",
        max_length=100,
        initial='Default',
        required=False,
    )
    overwrite = forms.BooleanField(
        label="Overwrite existing snapshots",
        initial=False,
        required=False,
    )
    update = forms.BooleanField(
        label="Update/retry previously failed URLs",
        initial=False,  # replaced per-instance in __init__ from ARCHIVING_CONFIG
        required=False,
    )
    index_only = forms.BooleanField(
        label="Index only (don't archive yet)",
        initial=False,
        required=False,
    )
    config = forms.JSONField(
        label="Custom config overrides",
        widget=KeyValueWidget(),
        initial=dict,
        required=False,
    )

    def __init__(self, *args, **kwargs):
        """Populate the plugin-group choices and the config-driven defaults.

        Each plugin-group field is offered only those plugins that are both
        returned by ``get_plugins()`` and listed for that group in
        ``_PLUGIN_GROUPS``; choices are sorted by plugin name.
        """
        super().__init__(*args, **kwargs)

        # Import at runtime to avoid circular imports
        from archivebox.config.common import ARCHIVING_CONFIG

        # Sort once up front; every group reuses the same ordering.
        available_plugins = sorted(get_plugins())

        for field_name, group_members in self._PLUGIN_GROUPS.items():
            self.fields[field_name].choices = [
                (plugin, plugin)
                for plugin in available_plugins
                if plugin in group_members
            ]

        # Set update default from config
        self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW

    def clean(self):
        """Merge the per-group plugin selections into ``cleaned_data['plugins']``.

        Returns:
            The cleaned data dict, with an added ``'plugins'`` key holding the
            selections of every plugin-group field concatenated in the order
            the groups are declared in ``_PLUGIN_GROUPS``.
        """
        cleaned_data = super().clean()

        selected_plugins = []
        for field_name in self._PLUGIN_GROUPS:
            # A group that is absent from cleaned_data contributes nothing.
            selected_plugins.extend(cleaned_data.get(field_name, []))

        # Store combined list for easy access
        cleaned_data['plugins'] = selected_plugins
        return cleaned_data
class TagWidgetMixin:
    """Widget mixin that turns a non-string tag value into text before rendering."""

    def format_value(self, value):
        """Return the widget's display value for ``value``.

        ``None`` and ``str`` values are handed to the next ``format_value`` in
        the MRO unchanged; anything else is first passed through
        ``edit_string_for_tags``.
        """
        already_renderable = value is None or isinstance(value, str)
        rendered = value if already_renderable else edit_string_for_tags(value)
        return super().format_value(rendered)
class TagWidget(TagWidgetMixin, forms.TextInput):
    """Text input combined with ``TagWidgetMixin``'s value formatting."""
class TagField(forms.CharField):
    """Character field whose cleaned value is the result of ``parse_tags``."""

    widget = TagWidget

    def clean(self, value):
        """Run the normal ``CharField`` cleaning, then parse the text as tags.

        Returns:
            Whatever ``parse_tags`` returns for the cleaned string.

        Raises:
            forms.ValidationError: if ``CharField`` validation fails, or if
                ``parse_tags`` raises ``ValueError`` for the input.
        """
        value = super().clean(value)
        try:
            return parse_tags(value)
        except ValueError as err:
            # Chain the original error so the parse failure stays visible in
            # tracebacks instead of appearing as an unrelated exception.
            raise forms.ValidationError(
                "Please provide a comma-separated list of tags."
            ) from err

    def has_changed(self, initial_value, data_value):
        """Report whether the submitted tags differ from the initial tags.

        Args:
            initial_value: iterable of objects exposing a ``.name`` attribute,
                or ``None`` for "no initial tags".
            data_value: the raw submitted value for this field.

        Returns:
            ``False`` if the field is disabled; otherwise ``True`` when the
            sorted initial tag names differ from the cleaned submitted value.
        """
        # Always return False if the field is disabled since self.bound_data
        # always uses the initial value in this case.
        if self.disabled:
            return False

        try:
            data_value = self.clean(data_value)
        except forms.ValidationError:
            # Deliberate best-effort: change detection must not raise, so the
            # raw value is compared as-is when it cannot be cleaned.
            pass

        # Normalise "nothing submitted" to an empty list. Without this, blank
        # input that failed cleaning (e.g. on a required field) stays as ''
        # or None and is reported as a change even when there were no initial
        # tags either.
        if not data_value:
            data_value = []
        if initial_value is None:
            initial_value = []

        # NOTE(review): the comparison presumably relies on parse_tags
        # returning a sorted list of names - confirm against taggit.
        initial_names = sorted(tag.name for tag in initial_value)
        return initial_names != data_value