mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 01:15:57 +10:00
239 lines
7.3 KiB
Python
239 lines
7.3 KiB
Python
# Explicitly declare the package this module belongs to.
# NOTE(review): presumably set so imports resolve the same way regardless of
# how the module is loaded — confirm before removing.
__package__ = 'archivebox.core'
|
|
|
|
from django import forms
|
|
|
|
from archivebox.misc.util import URL_REGEX
|
|
from taggit.utils import edit_string_for_tags, parse_tags
|
|
from archivebox.base_models.admin import KeyValueWidget
|
|
|
|
# Human-readable labels for each crawl depth; the position in this tuple is
# the depth value itself.
_DEPTH_LABELS = (
    'depth = 0 (archive just these URLs)',
    'depth = 1 (+ URLs one hop away)',
    'depth = 2 (+ URLs two hops away)',
    'depth = 3 (+ URLs three hops away)',
    'depth = 4 (+ URLs four hops away)',
)

# (value, label) pairs used as the choices of the "Archive depth" field.
# Values are the depth numbers rendered as strings.
DEPTH_CHOICES = tuple(
    (str(depth), label) for depth, label in enumerate(_DEPTH_LABELS)
)
|
|
|
|
from archivebox.hooks import get_plugins
|
|
|
|
def get_plugin_choices():
    """Return ``(value, label)`` choice pairs for every discovered plugin.

    Each plugin name returned by ``get_plugins()`` is used as both the
    submitted value and the displayed label.
    """
    choices = []
    for plugin_name in get_plugins():
        choices.append((plugin_name, plugin_name))
    return choices
|
|
|
|
|
|
class AddLinkForm(forms.Form):
    """Form for submitting one or more URLs to be archived.

    Besides the basic URL / tag / depth / notes inputs, the form offers one
    checkbox group per plugin category plus a set of advanced options.
    ``clean()`` merges the per-category selections into a single
    ``cleaned_data['plugins']`` list.
    """

    # Plugin names offered by each plugin-group field, keyed by field name.
    # Insertion order matters: it is the order in which clean() concatenates
    # the selections. Discovered plugins that appear in none of these groups
    # are not offered by any field.
    _PLUGIN_GROUPS = {
        'chrome_plugins': frozenset({
            'accessibility', 'chrome', 'consolelog', 'dom', 'headers',
            'parse_dom_outlinks', 'pdf', 'redirects', 'responses',
            'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title',
        }),
        'archiving_plugins': frozenset({
            'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
            'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget',
        }),
        'parsing_plugins': frozenset({
            'parse_html_urls', 'parse_jsonl_urls',
            'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls',
        }),
        'search_plugins': frozenset({
            'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite',
        }),
        'binary_plugins': frozenset({'apt', 'brew', 'custom', 'env', 'npm', 'pip'}),
        'extension_plugins': frozenset({
            'twocaptcha', 'istilldontcareaboutcookies', 'ublock',
        }),
    }

    # Basic fields
    url = forms.RegexField(
        label="URLs (one per line)",
        regex=URL_REGEX,
        min_length=6,  # integer, as documented for CharField.min_length
        strip=True,
        widget=forms.Textarea,
        required=True,
    )
    tag = forms.CharField(
        label="Tags (comma separated tag1,tag2,tag3)",
        strip=True,
        required=False,
        widget=forms.TextInput(attrs={
            'list': 'tag-datalist',
            'autocomplete': 'off',
        }),
    )
    depth = forms.ChoiceField(
        label="Archive depth",
        choices=DEPTH_CHOICES,
        initial='0',
        widget=forms.RadioSelect(attrs={"class": "depth-selection"}),
    )
    notes = forms.CharField(
        label="Notes",
        strip=True,
        required=False,
        widget=forms.Textarea(attrs={
            'rows': 3,
            'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
        }),
    )

    # Plugin groups (choices are populated per instance in __init__)
    chrome_plugins = forms.MultipleChoiceField(
        label="Chrome-dependent plugins",
        required=False,
        widget=forms.CheckboxSelectMultiple,
        choices=[],
    )
    archiving_plugins = forms.MultipleChoiceField(
        label="Archiving",
        required=False,
        widget=forms.CheckboxSelectMultiple,
        choices=[],
    )
    parsing_plugins = forms.MultipleChoiceField(
        label="Parsing",
        required=False,
        widget=forms.CheckboxSelectMultiple,
        choices=[],
    )
    search_plugins = forms.MultipleChoiceField(
        label="Search",
        required=False,
        widget=forms.CheckboxSelectMultiple,
        choices=[],
    )
    binary_plugins = forms.MultipleChoiceField(
        label="Binary providers",
        required=False,
        widget=forms.CheckboxSelectMultiple,
        choices=[],
    )
    extension_plugins = forms.MultipleChoiceField(
        label="Browser extensions",
        required=False,
        widget=forms.CheckboxSelectMultiple,
        choices=[],
    )

    # Advanced options
    schedule = forms.CharField(
        label="Repeat schedule",
        max_length=64,
        required=False,
        widget=forms.TextInput(attrs={
            'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
        }),
    )
    persona = forms.CharField(
        label="Persona (authentication profile)",
        max_length=100,
        initial='Default',
        required=False,
    )
    overwrite = forms.BooleanField(
        label="Overwrite existing snapshots",
        initial=False,
        required=False,
    )
    update = forms.BooleanField(
        label="Update/retry previously failed URLs",
        initial=False,
        required=False,
    )
    index_only = forms.BooleanField(
        label="Index only (don't archive yet)",
        initial=False,
        required=False,
    )
    config = forms.JSONField(
        label="Custom config overrides",
        widget=KeyValueWidget(),
        initial=dict,
        required=False,
    )

    def __init__(self, *args, **kwargs):
        """Build the form, then fill in the values only known at runtime.

        Each plugin checkbox group is limited to the plugins that are both
        returned by ``get_plugins()`` and listed in the matching entry of
        ``_PLUGIN_GROUPS``. The ``update`` checkbox defaults to the inverse
        of ``ARCHIVING_CONFIG.ONLY_NEW``.
        """
        super().__init__(*args, **kwargs)

        # Import at runtime to avoid circular imports
        from archivebox.config.common import ARCHIVING_CONFIG

        # Sort once and reuse for every group instead of re-sorting per field.
        available_plugins = sorted(get_plugins())
        for field_name, group in self._PLUGIN_GROUPS.items():
            self.fields[field_name].choices = [
                (plugin, plugin)
                for plugin in available_plugins
                if plugin in group
            ]

        # Set update default from config
        self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW

    def clean(self):
        """Run the base validation and add a combined ``plugins`` list.

        Returns:
            The cleaned data dict with an extra ``'plugins'`` key holding the
            selections of every plugin group, concatenated in the order the
            groups are listed in ``_PLUGIN_GROUPS``.
        """
        cleaned_data = super().clean()

        all_selected_plugins = []
        for field_name in self._PLUGIN_GROUPS:
            # A group with no entry in cleaned_data contributes nothing.
            all_selected_plugins.extend(cleaned_data.get(field_name, []))

        # Store combined list for easy access
        cleaned_data['plugins'] = all_selected_plugins
        return cleaned_data
|
|
|
|
class TagWidgetMixin:
    """Widget mixin that turns non-string tag values into an editable string.

    Must be combined with a widget class that provides ``format_value``.
    """

    def format_value(self, value):
        """Format *value* for display via the next ``format_value`` in the MRO.

        ``None`` and plain strings are passed through untouched; anything
        else is first converted with ``edit_string_for_tags``.
        """
        passthrough = value is None or isinstance(value, str)
        display_value = value if passthrough else edit_string_for_tags(value)
        return super().format_value(display_value)
|
|
|
|
class TagWidget(TagWidgetMixin, forms.TextInput):
    """Single-line text input that uses ``TagWidgetMixin`` value formatting."""
|
|
|
|
class TagField(forms.CharField):
    """Character field whose cleaned value is the result of ``parse_tags``.

    Rendered with ``TagWidget`` by default.
    """

    widget = TagWidget

    def clean(self, value):
        """Validate *value* as text, then parse it into tags.

        Returns:
            Whatever ``parse_tags`` returns for the validated text.

        Raises:
            forms.ValidationError: if the base validation fails, or if
                ``parse_tags`` rejects the text with a ``ValueError``.
        """
        value = super().clean(value)
        try:
            return parse_tags(value)
        except ValueError as err:
            # Chain the original error so the parse failure stays visible
            # in tracebacks.
            raise forms.ValidationError(
                "Please provide a comma-separated list of tags."
            ) from err

    def has_changed(self, initial_value, data_value):
        """Report whether the submitted tags differ from the initial tags.

        Args:
            initial_value: Iterable of objects exposing a ``name`` attribute,
                or an empty value.
            data_value: Raw submitted value for this field.

        Returns:
            ``False`` if the field is disabled; otherwise ``True`` when the
            sorted initial tag names differ from the cleaned submitted value.
        """
        # Always return False if the field is disabled since self.bound_data
        # always uses the initial value in this case.
        if self.disabled:
            return False

        try:
            data_value = self.clean(data_value)
        except forms.ValidationError:
            # Deliberate best effort: input that fails validation is
            # compared in its raw form below.
            pass

        # Normalize empty values on both sides so that a blank submission
        # which failed validation ('' or None) is not reported as a change
        # when there were no initial tags either.
        if not data_value:
            data_value = []
        if not initial_value:
            initial_value = []

        # Initial names are sorted before comparing.
        # NOTE(review): this assumes parse_tags returns names in sorted
        # order — confirm against taggit.
        initial_names = sorted(tag.name for tag in initial_value)

        return initial_names != data_value
|