Merge branch 'v0.5.0' of github.com:ArchiveBox/ArchiveBox into feat-snapshots-grid

This commit is contained in:
jdcaballerov
2020-12-14 15:05:19 -05:00
32 changed files with 401 additions and 237 deletions

View File

@@ -63,7 +63,7 @@ def run_subcommand(subcommand: str,
if subcommand not in meta_cmds:
from ..config import setup_django
setup_django(in_memory_db=subcommand in fake_db)
setup_django(in_memory_db=subcommand in fake_db, check_db=subcommand in archive_cmds)
module = import_module('.archivebox_{}'.format(subcommand), __package__)
module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore

View File

@@ -89,8 +89,8 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
index_only=command.index_only,
overwrite=command.overwrite,
init=command.init,
out_dir=pwd or OUTPUT_DIR,
extractors=command.extract,
out_dir=pwd or OUTPUT_DIR,
)

View File

@@ -36,6 +36,13 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
' ~/Desktop/sites_list.csv\n'
)
)
parser.add_argument(
"--extract",
type=str,
help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
This does not take precedence over the configuration",
default=""
)
parser.add_argument(
'--out-dir',
type=str,
@@ -55,6 +62,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
oneshot(
url=stdin_url or url,
out_dir=Path(command.out_dir).resolve(),
extractors=command.extract,
)

View File

@@ -161,6 +161,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'USE_CHROME': {'type': bool, 'default': True},
'USE_NODE': {'type': bool, 'default': True},
'USE_YOUTUBEDL': {'type': bool, 'default': True},
'USE_RIPGREP': {'type': bool, 'default': True},
'CURL_BINARY': {'type': str, 'default': 'curl'},
'GIT_BINARY': {'type': str, 'default': 'git'},
@@ -170,6 +171,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'MERCURY_BINARY': {'type': str, 'default': 'mercury-parser'},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
'NODE_BINARY': {'type': str, 'default': 'node'},
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
'CHROME_BINARY': {'type': str, 'default': None},
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
@@ -275,7 +277,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}},
'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent},
'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME / 'legacy'},
'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME},
'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
@@ -312,6 +314,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
@@ -320,7 +323,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if (c['USE_MERCURY'] and c['MERCURY_BINARY']) else None}, # mercury is unversioned
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury is unversioned
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
@@ -334,8 +337,6 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()},
'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'])},
'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
@@ -343,6 +344,9 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']},
'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])},
'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
@@ -595,7 +599,7 @@ def bin_path(binary: Optional[str]) -> Optional[str]:
if node_modules_bin.exists():
return str(node_modules_bin.resolve())
return shutil.which(Path(binary).expanduser()) or binary
return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary
def bin_hash(binary: Optional[str]) -> Optional[str]:
if binary is None:
@@ -682,7 +686,7 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
'TEMPLATES_DIR': {
'path': (config['TEMPLATES_DIR']).resolve(),
'enabled': True,
'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
'is_valid': (config['TEMPLATES_DIR'] / config['ACTIVE_THEME'] / 'static').exists(),
},
# 'NODE_MODULES_DIR': {
# 'path': ,
@@ -826,6 +830,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
'enabled': config['USE_CHROME'],
'is_valid': bool(config['CHROME_VERSION']),
},
'RIPGREP_BINARY': {
'path': bin_path(config['RIPGREP_BINARY']),
'version': config['RIPGREP_VERSION'],
'hash': bin_hash(config['RIPGREP_BINARY']),
'enabled': config['USE_RIPGREP'],
'is_valid': bool(config['RIPGREP_VERSION']),
},
}
def get_chrome_info(config: ConfigDict) -> ConfigValue:

View File

@@ -10,11 +10,22 @@ CHOICES = (
('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
)
from ..extractors import get_default_archive_methods
# Choice pairs for the form's multi-select: each entry yielded by
# get_default_archive_methods() is a 3-tuple, and only its first element
# (the method name) is used, as both the stored value and the display label.
ARCHIVE_METHODS = [
    (method_name, method_name)
    for method_name, _, _ in get_default_archive_methods()
]
class AddLinkForm(forms.Form):
    """Form for submitting URLs to archive.

    Fields:
        url: text area validated against URL_REGEX (required).
        depth: radio selection from CHOICES, defaulting to '0'.
        archive_methods: optional multi-select built from ARCHIVE_METHODS.
    """

    url = forms.RegexField(
        label="URLs (one per line)",
        regex=URL_REGEX,
        min_length='6',
        strip=True,
        widget=forms.Textarea,
        required=True,
    )
    depth = forms.ChoiceField(
        label="Archive depth",
        choices=CHOICES,
        widget=forms.RadioSelect,
        initial='0',
    )
    # Not required: the form validates even when no method is selected.
    archive_methods = forms.MultipleChoiceField(
        required=False,
        widget=forms.SelectMultiple,
        choices=ARCHIVE_METHODS,
    )
class TagWidgetMixin:
def format_value(self, value):
if value is not None and not isinstance(value, str):

View File

@@ -9,6 +9,12 @@ import django.db.models.deletion
from config import CONFIG
from index.json import to_json
# Use `models.JSONField` when the `models` module provides it; otherwise
# fall back to the JSONField from the separate `jsonfield` package.
if hasattr(models, 'JSONField'):
    JSONField = models.JSONField
else:
    import jsonfield
    JSONField = jsonfield.JSONField
def forwards_func(apps, schema_editor):
from core.models import EXTRACTORS
@@ -76,7 +82,7 @@ class Migration(migrations.Migration):
name='ArchiveResult',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('cmd', models.JSONField()),
('cmd', JSONField()),
('pwd', models.CharField(max_length=256)),
('cmd_version', models.CharField(max_length=32)),
('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)),

View File

@@ -18,6 +18,12 @@ STATUS_CHOICES = [
("skipped", "skipped")
]
# Use `models.JSONField` when the `models` module provides it; otherwise
# fall back to the JSONField from the separate `jsonfield` package.
if hasattr(models, 'JSONField'):
    JSONField = models.JSONField
else:
    import jsonfield
    JSONField = jsonfield.JSONField
class Tag(models.Model):
"""
@@ -173,7 +179,7 @@ class ArchiveResultManager(models.Manager):
class ArchiveResult(models.Model):
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
cmd = models.JSONField()
cmd = JSONField()
pwd = models.CharField(max_length=256)
cmd_version = models.CharField(max_length=32)
output = models.CharField(max_length=512)

View File

@@ -12,6 +12,7 @@ from ..config import (
ALLOWED_HOSTS,
PACKAGE_DIR,
ACTIVE_THEME,
TEMPLATES_DIR_NAME,
SQL_INDEX_FILENAME,
OUTPUT_DIR,
)
@@ -68,14 +69,14 @@ AUTHENTICATION_BACKENDS = [
STATIC_URL = '/static/'
STATICFILES_DIRS = [
str(Path(PACKAGE_DIR) / 'themes' / ACTIVE_THEME / 'static'),
str(Path(PACKAGE_DIR) / 'themes' / 'default' / 'static'),
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / ACTIVE_THEME / 'static'),
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'default' / 'static'),
]
TEMPLATE_DIRS = [
str(Path(PACKAGE_DIR) / 'themes' / ACTIVE_THEME),
str(Path(PACKAGE_DIR) / 'themes' / 'default'),
str(Path(PACKAGE_DIR) / 'themes'),
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / ACTIVE_THEME),
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'default'),
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME),
]
TEMPLATES = [

View File

@@ -150,12 +150,15 @@ class AddView(UserPassesTestMixin, FormView):
url = form.cleaned_data["url"]
print(f'[+] Adding URL: {url}')
depth = 0 if form.cleaned_data["depth"] == "0" else 1
extractors = ','.join(form.cleaned_data["archive_methods"])
input_kwargs = {
"urls": url,
"depth": depth,
"update_all": False,
"out_dir": OUTPUT_DIR,
}
if extractors:
input_kwargs.update({"extractors": extractors})
add_stdout = StringIO()
with redirect_stdout(add_stdout):
add(**input_kwargs)

View File

@@ -20,7 +20,6 @@ from ..config import (
CURL_ARGS,
CURL_VERSION,
CURL_USER_AGENT,
setup_django,
)
from ..logging_util import TimedProgress
@@ -81,7 +80,6 @@ def extract_title_with_regex(html):
def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""try to guess the page's title from its content"""
setup_django(out_dir=out_dir)
from core.models import Snapshot
output: ArchiveOutput = None

View File

@@ -18,7 +18,6 @@ from ..util import (
ExtendedEncoder,
)
from ..config import (
setup_django,
ARCHIVE_DIR_NAME,
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,
@@ -243,16 +242,9 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
log_indexing_process_finished()
@enforce_types
def get_empty_snapshot_queryset(out_dir: Path=OUTPUT_DIR):
    """Return `Snapshot.objects.none()` after calling setup_django for `out_dir`.

    The Snapshot model is imported inside the function, after the
    setup_django(out_dir, check_db=True) call has run.
    """
    setup_django(out_dir, check_db=True)

    from core.models import Snapshot

    empty_snapshots = Snapshot.objects.none()
    return empty_snapshots
@enforce_types
def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
"""parse and load existing index with any new links from import_path merged in"""
setup_django(out_dir, check_db=True)
from core.models import Snapshot
try:
return Snapshot.objects.all()
@@ -390,8 +382,9 @@ def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type:
color='red',
)
raise SystemExit(2)
from core.models import Snapshot
qsearch = get_empty_snapshot_queryset()
qsearch = Snapshot.objects.none()
for pattern in filter_patterns:
try:
qsearch |= query_search_index(pattern)

View File

@@ -23,7 +23,6 @@ from ..config import (
GIT_SHA,
FOOTER_INFO,
HTML_INDEX_FILENAME,
setup_django,
)
MAIN_INDEX_TEMPLATE = 'main_index.html'
@@ -111,7 +110,6 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
"""render a given html template string with the given template content"""
from django.template.loader import render_to_string
setup_django(check_db=False)
return render_to_string(template, context)

View File

@@ -9,7 +9,6 @@ DO NOT ADD ANY NEW FEATURES TO THIS FILE, NEW CODE GOES HERE: core/models.py
__package__ = 'archivebox.index'
from pathlib import Path
from django.db.utils import OperationalError
from datetime import datetime, timedelta

View File

@@ -19,6 +19,7 @@ if TYPE_CHECKING:
from .util import enforce_types
from .config import (
ConfigDict,
OUTPUT_DIR,
PYTHON_ENCODING,
ANSI,
IS_TTY,
@@ -514,19 +515,24 @@ def printable_folder_status(name: str, folder: Dict) -> str:
else:
num_files = 'missing'
if ' ' in str(folder['path']):
folder['path'] = f'"{folder["path"]}"'
path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
if path and ' ' in path:
path = f'"{path}"'
# if path is just a plain dot, replace it back with the full path for clarity
if path == '.':
path = str(OUTPUT_DIR)
return ' '.join((
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(22),
(str(folder["path"]) or '').ljust(76),
name.ljust(21),
num_files.ljust(14),
ANSI[color],
note,
note.ljust(8),
ANSI['reset'],
path.ljust(76),
))
@@ -546,17 +552,18 @@ def printable_dependency_version(name: str, dependency: Dict) -> str:
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
if ' ' in (dependency["path"] or ''):
dependency["path"] = f'"{dependency["path"]}"'
path = str(dependency["path"]).replace(str(OUTPUT_DIR), '.') if dependency["path"] else ''
if path and ' ' in path:
path = f'"{path}"'
return ' '.join((
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(22),
(dependency["path"] or '').ljust(76),
name.ljust(21),
version.ljust(14),
ANSI[color],
note,
note.ljust(8),
ANSI['reset'],
path.ljust(76),
))

View File

@@ -29,7 +29,6 @@ from .util import enforce_types # type: ignore
from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from .index import (
load_main_index,
get_empty_snapshot_queryset,
parse_links_from_source,
dedupe_links,
write_main_index,
@@ -218,7 +217,7 @@ def version(quiet: bool=False,
else:
print('ArchiveBox v{}'.format(VERSION))
p = platform.uname()
print(p.system, platform.platform(), p.machine)
print(sys.implementation.name.title(), p.system, platform.platform(), p.machine, '(in Docker)' if IN_DOCKER else '(not in Docker)')
print()
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
@@ -265,6 +264,7 @@ def run(subcommand: str,
@enforce_types
def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
"""Initialize a new ArchiveBox collection in the current directory"""
from core.models import Snapshot
Path(out_dir).mkdir(exist_ok=True)
is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
@@ -335,7 +335,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
print()
print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
all_links = get_empty_snapshot_queryset()
all_links = Snapshot.objects.none()
pending_links: Dict[str, Link] = {}
if existing_index:
@@ -511,7 +511,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
@enforce_types
def oneshot(url: str, out_dir: Path=OUTPUT_DIR):
def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
"""
Create a single URL archive folder with an index.json and index.html, and all the archive method outputs.
You can run this to archive single pages without needing to create a whole collection with archivebox init.
@@ -523,7 +523,8 @@ def oneshot(url: str, out_dir: Path=OUTPUT_DIR):
color='red'
)
raise SystemExit(2)
methods = ignore_methods(['title'])
methods = extractors.split(",") if extractors else ignore_methods(['title'])
archive_link(oneshot_link[0], out_dir=out_dir, methods=methods)
return oneshot_link
@@ -534,8 +535,8 @@ def add(urls: Union[str, List[str]],
index_only: bool=False,
overwrite: bool=False,
init: bool=False,
out_dir: Path=OUTPUT_DIR,
extractors: str="") -> List[Link]:
extractors: str="",
out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Add a new URL or list of URLs to your archive"""
assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

View File

@@ -6,7 +6,7 @@ from django.db.models import QuerySet
from archivebox.index.schema import Link
from archivebox.util import enforce_types
from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
from archivebox.config import stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
from .utils import get_indexable_content, log_index_started
@@ -49,7 +49,6 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
@enforce_types
def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
setup_django(out_dir, check_db=True)
from core.models import Snapshot
if search_backend_enabled():
@@ -107,4 +106,3 @@ def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
)
else:
write_search_index(link, texts, out_dir=out_dir)

View File

@@ -1,8 +1,8 @@
import re
from subprocess import run, PIPE, DEVNULL
from subprocess import run, PIPE
from typing import List, Generator
from archivebox.config import setup_django, ARCHIVE_DIR
from archivebox.config import ARCHIVE_DIR, RIPGREP_VERSION
from archivebox.util import enforce_types
RG_IGNORE_EXTENSIONS = ('css','js','orig','svg')
@@ -26,11 +26,9 @@ def flush(snapshot_ids: Generator[str, None, None]):
@enforce_types
def search(text: str) -> List[str]:
is_rg_installed = run(['which', 'rg'], stdout=DEVNULL, stderr=DEVNULL)
if is_rg_installed.returncode:
if not RIPGREP_VERSION:
raise Exception("ripgrep binary not found, install ripgrep to use this search backend")
setup_django(check_db=True)
from core.models import Snapshot
rg_cmd = ['rg', RG_ADD_TYPE, RG_IGNORE_ARGUMENTS, RG_DEFAULT_ARGUMENTS, RG_REGEX_ARGUMENT, text, str(ARCHIVE_DIR)]
@@ -45,4 +43,3 @@ def search(text: str) -> List[str]:
snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)]
return snap_ids