Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2026-04-03 06:17:53 +10:00)
Merge branch 'v0.5.0' of github.com:ArchiveBox/ArchiveBox into feat-snapshots-grid
@@ -63,7 +63,7 @@ def run_subcommand(subcommand: str,
     if subcommand not in meta_cmds:
         from ..config import setup_django
-        setup_django(in_memory_db=subcommand in fake_db)
+        setup_django(in_memory_db=subcommand in fake_db, check_db=subcommand in archive_cmds)

     module = import_module('.archivebox_{}'.format(subcommand), __package__)
     module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore

@@ -89,8 +89,8 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         index_only=command.index_only,
         overwrite=command.overwrite,
         init=command.init,
-        out_dir=pwd or OUTPUT_DIR,
         extractors=command.extract,
+        out_dir=pwd or OUTPUT_DIR,
     )

@@ -36,6 +36,13 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
             ' ~/Desktop/sites_list.csv\n'
         )
     )
+    parser.add_argument(
+        "--extract",
+        type=str,
+        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
+              This does not take precedence over the configuration",
+        default=""
+    )
     parser.add_argument(
         '--out-dir',
         type=str,

@@ -55,6 +62,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     oneshot(
         url=stdin_url or url,
         out_dir=Path(command.out_dir).resolve(),
+        extractors=command.extract,
     )

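The new flag is plain argparse plumbing: --extract arrives as one comma-separated string and is handed to oneshot() untouched. A minimal standalone sketch of that flow (illustrative only, not ArchiveBox code):

import argparse

parser = argparse.ArgumentParser(prog='archivebox oneshot')
parser.add_argument("--extract", type=str, default="")
command = parser.parse_args(["--extract", "title,wget,screenshot"])

# oneshot() receives the raw string and splits it later (see the oneshot() hunk further down)
print(command.extract)   # -> "title,wget,screenshot"
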
@@ -161,6 +161,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
     'USE_CHROME': {'type': bool, 'default': True},
     'USE_NODE': {'type': bool, 'default': True},
     'USE_YOUTUBEDL': {'type': bool, 'default': True},
+    'USE_RIPGREP': {'type': bool, 'default': True},

     'CURL_BINARY': {'type': str, 'default': 'curl'},
     'GIT_BINARY': {'type': str, 'default': 'git'},

@@ -170,6 +171,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
     'MERCURY_BINARY': {'type': str, 'default': 'mercury-parser'},
     'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
     'NODE_BINARY': {'type': str, 'default': 'node'},
+    'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
     'CHROME_BINARY': {'type': str, 'default': None},

     'POCKET_CONSUMER_KEY': {'type': str, 'default': None},

@@ -275,7 +277,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}},

     'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent},
-    'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME / 'legacy'},
+    'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME},

     'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
     'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},

@@ -312,6 +314,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
     'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},

+    'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},

     'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
     'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},

@@ -320,7 +323,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},

     'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
-    'MERCURY_VERSION': {'default': lambda c: '1.0.0' if (c['USE_MERCURY'] and c['MERCURY_BINARY']) else None}, # mercury is unversioned
+    'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury is unversioned

     'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
     'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},

@@ -334,8 +337,6 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
     'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()},
     'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
-    'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'])},
-    'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},

     'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
     'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},

@@ -343,6 +344,9 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']},
     'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
     'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},

+    'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])},
+    'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},

     'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
     'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},

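For context, every entry in DERIVED_CONFIG_DEFAULTS maps a key to a lambda that receives the partially-built config dict, which is why USE_NODE can be recomputed here from the SAVE_* flags after they are resolved. A simplified sketch of that evaluation model (a plain dict loop, not ArchiveBox's actual loader):

config = {'USE_NODE': True, 'SAVE_READABILITY': False, 'SAVE_SINGLEFILE': True, 'SAVE_MERCURY': False}

derived = {
    'USE_NODE': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY']),
}

for key, default in derived.items():
    config[key] = default(config)

print(config['USE_NODE'])   # True only if node is enabled AND some node-based extractor is on
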
@@ -595,7 +599,7 @@ def bin_path(binary: Optional[str]) -> Optional[str]:
     if node_modules_bin.exists():
         return str(node_modules_bin.resolve())

-    return shutil.which(Path(binary).expanduser()) or binary
+    return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary

 def bin_hash(binary: Optional[str]) -> Optional[str]:
     if binary is None:

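The updated return line folds three fallbacks into one expression: try the ~-expanded path, then the bare name on $PATH, then hand back the original string so callers can still display it. A minimal sketch of just that chain (the str() casts avoid passing a Path object straight to shutil.which()):

import shutil
from pathlib import Path

def resolve_binary(binary: str) -> str:
    return (
        shutil.which(str(Path(binary).expanduser()))    # e.g. a user-supplied '~/bin/...' path
        or shutil.which(str(binary))                    # plain name looked up on $PATH
        or binary                                       # unresolved: return as-is
    )

print(resolve_binary('git'))
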
@@ -682,7 +686,7 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
         'TEMPLATES_DIR': {
             'path': (config['TEMPLATES_DIR']).resolve(),
             'enabled': True,
-            'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
+            'is_valid': (config['TEMPLATES_DIR'] / config['ACTIVE_THEME'] / 'static').exists(),
         },
         # 'NODE_MODULES_DIR': {
         #     'path': ,

@@ -826,6 +830,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': config['USE_CHROME'],
             'is_valid': bool(config['CHROME_VERSION']),
         },
+        'RIPGREP_BINARY': {
+            'path': bin_path(config['RIPGREP_BINARY']),
+            'version': config['RIPGREP_VERSION'],
+            'hash': bin_hash(config['RIPGREP_BINARY']),
+            'enabled': config['USE_RIPGREP'],
+            'is_valid': bool(config['RIPGREP_VERSION']),
+        },
     }

 def get_chrome_info(config: ConfigDict) -> ConfigValue:

@@ -10,11 +10,22 @@ CHOICES = (
     ('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
 )

+from ..extractors import get_default_archive_methods
+
+ARCHIVE_METHODS = [
+    (name, name)
+    for name, _, _ in get_default_archive_methods()
+]
+

 class AddLinkForm(forms.Form):
     url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
     depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')
+    archive_methods = forms.MultipleChoiceField(
+        required=False,
+        widget=forms.SelectMultiple,
+        choices=ARCHIVE_METHODS,
+    )
 class TagWidgetMixin:
     def format_value(self, value):
         if value is not None and not isinstance(value, str):

@@ -9,6 +9,12 @@ import django.db.models.deletion
 from config import CONFIG
 from index.json import to_json

+try:
+    JSONField = models.JSONField
+except AttributeError:
+    import jsonfield
+    JSONField = jsonfield.JSONField
+

 def forwards_func(apps, schema_editor):
     from core.models import EXTRACTORS

@@ -76,7 +82,7 @@ class Migration(migrations.Migration):
             name='ArchiveResult',
             fields=[
                 ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-                ('cmd', models.JSONField()),
+                ('cmd', JSONField()),
                 ('pwd', models.CharField(max_length=256)),
                 ('cmd_version', models.CharField(max_length=32)),
                 ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)),

@@ -18,6 +18,12 @@ STATUS_CHOICES = [
     ("skipped", "skipped")
 ]

+try:
+    JSONField = models.JSONField
+except AttributeError:
+    import jsonfield
+    JSONField = jsonfield.JSONField
+

 class Tag(models.Model):
     """

@@ -173,7 +179,7 @@ class ArchiveResultManager(models.Manager):

 class ArchiveResult(models.Model):
     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
-    cmd = models.JSONField()
+    cmd = JSONField()
     pwd = models.CharField(max_length=256)
     cmd_version = models.CharField(max_length=32)
     output = models.CharField(max_length=512)

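The try/except shim that now appears in both the migration and the model has one purpose: models.JSONField only exists on Django 3.1+, so older installs fall back to the third-party jsonfield package. A standalone sketch of the same pattern (assumes jsonfield is installed wherever the fallback is hit):

from django.db import models

try:
    JSONField = models.JSONField          # Django >= 3.1: built-in, DB-agnostic JSONField
except AttributeError:
    import jsonfield                      # Django < 3.1: third-party package provides it
    JSONField = jsonfield.JSONField

# Model fields can then be declared the same way on either Django version:
#     cmd = JSONField()
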
@@ -12,6 +12,7 @@ from ..config import (
     ALLOWED_HOSTS,
     PACKAGE_DIR,
     ACTIVE_THEME,
+    TEMPLATES_DIR_NAME,
     SQL_INDEX_FILENAME,
     OUTPUT_DIR,
 )

@@ -68,14 +69,14 @@ AUTHENTICATION_BACKENDS = [
 STATIC_URL = '/static/'

 STATICFILES_DIRS = [
-    str(Path(PACKAGE_DIR) / 'themes' / ACTIVE_THEME / 'static'),
-    str(Path(PACKAGE_DIR) / 'themes' / 'default' / 'static'),
+    str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / ACTIVE_THEME / 'static'),
+    str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'default' / 'static'),
 ]

 TEMPLATE_DIRS = [
-    str(Path(PACKAGE_DIR) / 'themes' / ACTIVE_THEME),
-    str(Path(PACKAGE_DIR) / 'themes' / 'default'),
-    str(Path(PACKAGE_DIR) / 'themes'),
+    str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / ACTIVE_THEME),
+    str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'default'),
+    str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME),
 ]

 TEMPLATES = [

@@ -150,12 +150,15 @@ class AddView(UserPassesTestMixin, FormView):
         url = form.cleaned_data["url"]
         print(f'[+] Adding URL: {url}')
         depth = 0 if form.cleaned_data["depth"] == "0" else 1
+        extractors = ','.join(form.cleaned_data["archive_methods"])
         input_kwargs = {
             "urls": url,
             "depth": depth,
             "update_all": False,
             "out_dir": OUTPUT_DIR,
         }
+        if extractors:
+            input_kwargs.update({"extractors": extractors})
         add_stdout = StringIO()
         with redirect_stdout(add_stdout):
             add(**input_kwargs)

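Two things happen in this view method: the selected archive_methods are joined into the same comma-separated string the CLI uses, and add()'s console output is captured so the view can display it. A small sketch of the capture pattern on its own (print() stands in for the real add() call):

from contextlib import redirect_stdout
from io import StringIO

add_stdout = StringIO()
with redirect_stdout(add_stdout):
    print('[+] Adding URL: https://example.com')   # stand-in for add(**input_kwargs)

captured = add_stdout.getvalue()                   # what the view can render back to the user
print(repr(captured))
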
@@ -20,7 +20,6 @@ from ..config import (
     CURL_ARGS,
     CURL_VERSION,
     CURL_USER_AGENT,
-    setup_django,
 )
 from ..logging_util import TimedProgress

@@ -81,7 +80,6 @@ def extract_title_with_regex(html):
 def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """try to guess the page's title from its content"""

-    setup_django(out_dir=out_dir)
     from core.models import Snapshot

     output: ArchiveOutput = None

@@ -18,7 +18,6 @@ from ..util import (
     ExtendedEncoder,
 )
 from ..config import (
-    setup_django,
     ARCHIVE_DIR_NAME,
     SQL_INDEX_FILENAME,
     JSON_INDEX_FILENAME,

@@ -243,16 +242,9 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:

     log_indexing_process_finished()

-@enforce_types
-def get_empty_snapshot_queryset(out_dir: Path=OUTPUT_DIR):
-    setup_django(out_dir, check_db=True)
-    from core.models import Snapshot
-    return Snapshot.objects.none()
-
 @enforce_types
 def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
     """parse and load existing index with any new links from import_path merged in"""
-    setup_django(out_dir, check_db=True)
     from core.models import Snapshot
     try:
         return Snapshot.objects.all()

@@ -390,8 +382,9 @@ def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type:
             color='red',
         )
         raise SystemExit(2)
+    from core.models import Snapshot

-    qsearch = get_empty_snapshot_queryset()
+    qsearch = Snapshot.objects.none()
     for pattern in filter_patterns:
         try:
             qsearch |= query_search_index(pattern)

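With get_empty_snapshot_queryset() gone, callers start from Snapshot.objects.none() directly: an empty QuerySet that results can be OR-ed onto. A hedged sketch of the pattern (assumes Django is already set up and core.models.Snapshot is importable; the filter shown is illustrative):

from core.models import Snapshot

qsearch = Snapshot.objects.none()                  # empty queryset, safe to |= onto
for pattern in ['example.com', 'wikipedia.org']:
    qsearch |= Snapshot.objects.filter(url__icontains=pattern)

print(qsearch.count())
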
@@ -23,7 +23,6 @@ from ..config import (
     GIT_SHA,
     FOOTER_INFO,
     HTML_INDEX_FILENAME,
-    setup_django,
 )

 MAIN_INDEX_TEMPLATE = 'main_index.html'

@@ -111,7 +110,6 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
     """render a given html template string with the given template content"""
     from django.template.loader import render_to_string

-    setup_django(check_db=False)
     return render_to_string(template, context)

@@ -9,7 +9,6 @@ DO NOT ADD ANY NEW FEATURES TO THIS FILE, NEW CODE GOES HERE: core/models.py
 __package__ = 'archivebox.index'

 from pathlib import Path
-from django.db.utils import OperationalError

 from datetime import datetime, timedelta

@@ -19,6 +19,7 @@ if TYPE_CHECKING:
 from .util import enforce_types
 from .config import (
     ConfigDict,
+    OUTPUT_DIR,
     PYTHON_ENCODING,
     ANSI,
     IS_TTY,

@@ -514,19 +515,24 @@ def printable_folder_status(name: str, folder: Dict) -> str:
     else:
         num_files = 'missing'

-    if ' ' in str(folder['path']):
-        folder['path'] = f'"{folder["path"]}"'
+    path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
+    if path and ' ' in path:
+        path = f'"{path}"'
+
+    # if path is just a plain dot, replace it back with the full path for clarity
+    if path == '.':
+        path = str(OUTPUT_DIR)

     return ' '.join((
         ANSI[color],
         symbol,
         ANSI['reset'],
-        name.ljust(22),
-        (str(folder["path"]) or '').ljust(76),
+        name.ljust(21),
         num_files.ljust(14),
         ANSI[color],
-        note,
+        note.ljust(8),
         ANSI['reset'],
+        path.ljust(76),
     ))

@@ -546,17 +552,18 @@ def printable_dependency_version(name: str, dependency: Dict) -> str:
     else:
         color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'

-    if ' ' in (dependency["path"] or ''):
-        dependency["path"] = f'"{dependency["path"]}"'
+    path = str(dependency["path"]).replace(str(OUTPUT_DIR), '.') if dependency["path"] else ''
+    if path and ' ' in path:
+        path = f'"{path}"'

     return ' '.join((
         ANSI[color],
         symbol,
         ANSI['reset'],
-        name.ljust(22),
-        (dependency["path"] or '').ljust(76),
+        name.ljust(21),
         version.ljust(14),
         ANSI[color],
-        note,
+        note.ljust(8),
         ANSI['reset'],
+        path.ljust(76),
     ))

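Both printable_* helpers now derive a local path string instead of mutating the dict they were given: collapse OUTPUT_DIR to '.', quote paths containing spaces, and restore the full path when nothing but '.' would be shown. A standalone sketch of that logic (the OUTPUT_DIR value is illustrative):

from pathlib import Path

OUTPUT_DIR = Path('/data/archivebox')              # illustrative collection root

def display_path(raw) -> str:
    path = str(raw).replace(str(OUTPUT_DIR), '.') if raw else ''
    if path and ' ' in path:
        path = f'"{path}"'
    if path == '.':                                # plain dot -> show the full path for clarity
        path = str(OUTPUT_DIR)
    return path

print(display_path(OUTPUT_DIR / 'archive'))        # -> ./archive
print(display_path(OUTPUT_DIR))                    # -> /data/archivebox
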
@@ -29,7 +29,6 @@ from .util import enforce_types # type: ignore
 from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
 from .index import (
     load_main_index,
-    get_empty_snapshot_queryset,
     parse_links_from_source,
     dedupe_links,
     write_main_index,

@@ -218,7 +217,7 @@ def version(quiet: bool=False,
     else:
         print('ArchiveBox v{}'.format(VERSION))
         p = platform.uname()
-        print(p.system, platform.platform(), p.machine)
+        print(sys.implementation.name.title(), p.system, platform.platform(), p.machine, '(in Docker)' if IN_DOCKER else '(not in Docker)')
         print()

         print('{white}[i] Dependency versions:{reset}'.format(**ANSI))

@@ -265,6 +264,7 @@ def run(subcommand: str,
 @enforce_types
 def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     """Initialize a new ArchiveBox collection in the current directory"""
+    from core.models import Snapshot
     Path(out_dir).mkdir(exist_ok=True)
     is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)

@@ -335,7 +335,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     print()
     print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))

-    all_links = get_empty_snapshot_queryset()
+    all_links = Snapshot.objects.none()
     pending_links: Dict[str, Link] = {}

     if existing_index:

@@ -511,7 +511,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:

 @enforce_types
-def oneshot(url: str, out_dir: Path=OUTPUT_DIR):
+def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
     """
     Create a single URL archive folder with an index.json and index.html, and all the archive method outputs.
     You can run this to archive single pages without needing to create a whole collection with archivebox init.

@@ -523,7 +523,8 @@ def oneshot(url: str, out_dir: Path=OUTPUT_DIR):
             color='red'
         )
         raise SystemExit(2)
-    methods = ignore_methods(['title'])
+
+    methods = extractors.split(",") if extractors else ignore_methods(['title'])
     archive_link(oneshot_link[0], out_dir=out_dir, methods=methods)
     return oneshot_link

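oneshot() keeps its old behaviour when no extractors are given (all default methods except title) and otherwise narrows the run to exactly the names passed in. A sketch of the selection logic in isolation (ignore_methods() and archive_link() are the module's own helpers and are not reproduced here):

def pick_methods(extractors: str):
    # illustrative stand-in: None means "fall back to the default method list minus title"
    return extractors.split(",") if extractors else None

print(pick_methods(""))                  # -> None (defaults)
print(pick_methods("title,wget,git"))    # -> ['title', 'wget', 'git']
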
@@ -534,8 +535,8 @@ def add(urls: Union[str, List[str]],
         index_only: bool=False,
         overwrite: bool=False,
         init: bool=False,
-        out_dir: Path=OUTPUT_DIR,
-        extractors: str="") -> List[Link]:
+        extractors: str="",
+        out_dir: Path=OUTPUT_DIR) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""

     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

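Because the callers shown above (the CLI wrapper and AddView) pass everything by keyword, swapping extractors= ahead of out_dir= in the signature changes nothing for them. A toy illustration, not the real function:

def add(urls, depth=0, extractors="", out_dir="."):      # simplified stand-in signature
    return urls, depth, extractors, out_dir

# keyword call sites are order-independent:
print(add(urls="https://example.com", out_dir="./data", extractors="title,wget"))
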
@@ -6,7 +6,7 @@ from django.db.models import QuerySet

 from archivebox.index.schema import Link
 from archivebox.util import enforce_types
-from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
+from archivebox.config import stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE

 from .utils import get_indexable_content, log_index_started

@@ -49,7 +49,6 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:

 @enforce_types
 def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
-    setup_django(out_dir, check_db=True)
     from core.models import Snapshot

     if search_backend_enabled():

@@ -107,4 +106,3 @@ def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
                 )
             else:
                 write_search_index(link, texts, out_dir=out_dir)

@@ -1,8 +1,8 @@
 import re
-from subprocess import run, PIPE, DEVNULL
+from subprocess import run, PIPE
 from typing import List, Generator

-from archivebox.config import setup_django, ARCHIVE_DIR
+from archivebox.config import ARCHIVE_DIR, RIPGREP_VERSION
 from archivebox.util import enforce_types

 RG_IGNORE_EXTENSIONS = ('css','js','orig','svg')

@@ -26,11 +26,9 @@ def flush(snapshot_ids: Generator[str, None, None]):

 @enforce_types
 def search(text: str) -> List[str]:
-    is_rg_installed = run(['which', 'rg'], stdout=DEVNULL, stderr=DEVNULL)
-    if is_rg_installed.returncode:
+    if not RIPGREP_VERSION:
         raise Exception("ripgrep binary not found, install ripgrep to use this search backend")

-    setup_django(check_db=True)
     from core.models import Snapshot

     rg_cmd = ['rg', RG_ADD_TYPE, RG_IGNORE_ARGUMENTS, RG_DEFAULT_ARGUMENTS, RG_REGEX_ARGUMENT, text, str(ARCHIVE_DIR)]

@@ -45,4 +43,3 @@ def search(text: str) -> List[str]:
     snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)]

     return snap_ids

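The search backend no longer shells out to `which rg` per query; it trusts RIPGREP_VERSION, which config.py derives once via bin_version() (see the config hunks above). A rough standalone approximation of that startup check, with stand-in names:

import shutil
import subprocess

def rg_version() -> str:
    """Rough stand-in for config's bin_version(RIPGREP_BINARY)."""
    if not shutil.which('rg'):
        return ''
    result = subprocess.run(['rg', '--version'], capture_output=True, text=True)
    return result.stdout.splitlines()[0] if result.returncode == 0 else ''

RIPGREP_VERSION = rg_version()
if not RIPGREP_VERSION:
    raise Exception("ripgrep binary not found, install ripgrep to use this search backend")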