mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 15:27:53 +10:00
wip
This commit is contained in:
@@ -4,10 +4,14 @@ from django import forms
|
||||
|
||||
from archivebox.misc.util import URL_REGEX
|
||||
from taggit.utils import edit_string_for_tags, parse_tags
|
||||
from archivebox.base_models.admin import KeyValueWidget
|
||||
|
||||
DEPTH_CHOICES = (
|
||||
('0', 'depth = 0 (archive just these URLs)'),
|
||||
('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
|
||||
('1', 'depth = 1 (+ URLs one hop away)'),
|
||||
('2', 'depth = 2 (+ URLs two hops away)'),
|
||||
('3', 'depth = 3 (+ URLs three hops away)'),
|
||||
('4', 'depth = 4 (+ URLs four hops away)'),
|
||||
)
|
||||
|
||||
from archivebox.hooks import get_plugins
|
||||
@@ -18,39 +22,180 @@ def get_plugin_choices():
|
||||
|
||||
|
||||
class AddLinkForm(forms.Form):
|
||||
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
|
||||
tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
|
||||
depth = forms.ChoiceField(label="Archive depth", choices=DEPTH_CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
|
||||
plugins = forms.MultipleChoiceField(
|
||||
label="Plugins (select at least 1, otherwise all will be used by default)",
|
||||
# Basic fields
|
||||
url = forms.RegexField(
|
||||
label="URLs (one per line)",
|
||||
regex=URL_REGEX,
|
||||
min_length='6',
|
||||
strip=True,
|
||||
widget=forms.Textarea,
|
||||
required=True
|
||||
)
|
||||
tag = forms.CharField(
|
||||
label="Tags (comma separated tag1,tag2,tag3)",
|
||||
strip=True,
|
||||
required=False,
|
||||
widget=forms.TextInput(attrs={
|
||||
'list': 'tag-datalist',
|
||||
'autocomplete': 'off',
|
||||
})
|
||||
)
|
||||
depth = forms.ChoiceField(
|
||||
label="Archive depth",
|
||||
choices=DEPTH_CHOICES,
|
||||
initial='0',
|
||||
widget=forms.RadioSelect(attrs={"class": "depth-selection"})
|
||||
)
|
||||
notes = forms.CharField(
|
||||
label="Notes",
|
||||
strip=True,
|
||||
required=False,
|
||||
widget=forms.Textarea(attrs={
|
||||
'rows': 3,
|
||||
'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
|
||||
})
|
||||
)
|
||||
|
||||
# Plugin groups
|
||||
chrome_plugins = forms.MultipleChoiceField(
|
||||
label="Chrome-dependent plugins",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[], # populated in __init__
|
||||
)
|
||||
archiving_plugins = forms.MultipleChoiceField(
|
||||
label="Archiving",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[],
|
||||
)
|
||||
parsing_plugins = forms.MultipleChoiceField(
|
||||
label="Parsing",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[],
|
||||
)
|
||||
search_plugins = forms.MultipleChoiceField(
|
||||
label="Search",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[],
|
||||
)
|
||||
binary_plugins = forms.MultipleChoiceField(
|
||||
label="Binary providers",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[],
|
||||
)
|
||||
extension_plugins = forms.MultipleChoiceField(
|
||||
label="Browser extensions",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[],
|
||||
)
|
||||
|
||||
# Advanced options
|
||||
schedule = forms.CharField(
|
||||
label="Repeat schedule",
|
||||
max_length=64,
|
||||
required=False,
|
||||
widget=forms.TextInput(attrs={
|
||||
'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
|
||||
})
|
||||
)
|
||||
persona = forms.CharField(
|
||||
label="Persona (authentication profile)",
|
||||
max_length=100,
|
||||
initial='Default',
|
||||
required=False,
|
||||
)
|
||||
overwrite = forms.BooleanField(
|
||||
label="Overwrite existing snapshots",
|
||||
initial=False,
|
||||
required=False,
|
||||
)
|
||||
update = forms.BooleanField(
|
||||
label="Update/retry previously failed URLs",
|
||||
initial=False,
|
||||
required=False,
|
||||
)
|
||||
index_only = forms.BooleanField(
|
||||
label="Index only (don't archive yet)",
|
||||
initial=False,
|
||||
required=False,
|
||||
)
|
||||
config = forms.JSONField(
|
||||
label="Custom config overrides",
|
||||
widget=KeyValueWidget(),
|
||||
initial=dict,
|
||||
required=False,
|
||||
widget=forms.SelectMultiple,
|
||||
choices=[], # populated dynamically in __init__
|
||||
)
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.fields['plugins'].choices = get_plugin_choices()
|
||||
# TODO: hook these up to the view and put them
|
||||
# in a collapsible UI section labeled "Advanced"
|
||||
#
|
||||
# exclude_patterns = forms.CharField(
|
||||
# label="Exclude patterns",
|
||||
# min_length='1',
|
||||
# required=False,
|
||||
# initial=URL_DENYLIST,
|
||||
# )
|
||||
# timeout = forms.IntegerField(
|
||||
# initial=TIMEOUT,
|
||||
# )
|
||||
# overwrite = forms.BooleanField(
|
||||
# label="Overwrite any existing Snapshots",
|
||||
# initial=False,
|
||||
# )
|
||||
# index_only = forms.BooleanField(
|
||||
# label="Add URLs to index without Snapshotting",
|
||||
# initial=False,
|
||||
# )
|
||||
|
||||
# Import at runtime to avoid circular imports
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
# Get all plugins
|
||||
all_plugins = get_plugins()
|
||||
|
||||
# Define plugin groups
|
||||
chrome_dependent = {
|
||||
'accessibility', 'chrome', 'consolelog', 'dom', 'headers',
|
||||
'parse_dom_outlinks', 'pdf', 'redirects', 'responses',
|
||||
'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
|
||||
}
|
||||
archiving = {
|
||||
'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
|
||||
'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget'
|
||||
}
|
||||
parsing = {
|
||||
'parse_html_urls', 'parse_jsonl_urls',
|
||||
'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls'
|
||||
}
|
||||
search = {
|
||||
'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
|
||||
}
|
||||
binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
|
||||
extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
|
||||
|
||||
# Populate plugin field choices
|
||||
self.fields['chrome_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in chrome_dependent
|
||||
]
|
||||
self.fields['archiving_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in archiving
|
||||
]
|
||||
self.fields['parsing_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in parsing
|
||||
]
|
||||
self.fields['search_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in search
|
||||
]
|
||||
self.fields['binary_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in binary
|
||||
]
|
||||
self.fields['extension_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in extensions
|
||||
]
|
||||
|
||||
# Set update default from config
|
||||
self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW
|
||||
|
||||
def clean(self):
|
||||
cleaned_data = super().clean()
|
||||
|
||||
# Combine all plugin groups into single list
|
||||
all_selected_plugins = []
|
||||
for field in ['chrome_plugins', 'archiving_plugins', 'parsing_plugins',
|
||||
'search_plugins', 'binary_plugins', 'extension_plugins']:
|
||||
all_selected_plugins.extend(cleaned_data.get(field, []))
|
||||
|
||||
# Store combined list for easy access
|
||||
cleaned_data['plugins'] = all_selected_plugins
|
||||
|
||||
return cleaned_data
|
||||
|
||||
class TagWidgetMixin:
|
||||
def format_value(self, value):
|
||||
|
||||
Reference in New Issue
Block a user