Merge branch 'master' into tags

This commit is contained in:
Cristian Vargas
2020-10-20 08:23:25 -05:00
committed by GitHub
31 changed files with 651 additions and 396 deletions

View File

@@ -6,12 +6,13 @@ import sys
import argparse
from typing import Optional, Dict, List, IO
from pathlib import Path
from ..config import OUTPUT_DIR
from importlib import import_module
CLI_DIR = os.path.dirname(os.path.abspath(__file__))
CLI_DIR = Path(__file__).resolve().parent
# these common commands will appear sorted before any others for ease-of-use
meta_cmds = ('help', 'version')

View File

@@ -7,6 +7,7 @@ import os
import sys
import shutil
import unittest
from pathlib import Path
from contextlib import contextmanager
@@ -109,13 +110,13 @@ class TestInit(unittest.TestCase):
with output_hidden():
archivebox_init.main([])
assert os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
assert os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
assert os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))
assert (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists()
assert (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists()
assert (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists()
assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0
def test_conflicting_init(self):
with open(os.path.join(OUTPUT_DIR, 'test_conflict.txt'), 'w+') as f:
with open(Path(OUTPUT_DIR) / 'test_conflict.txt', 'w+') as f:
f.write('test')
try:
@@ -125,9 +126,9 @@ class TestInit(unittest.TestCase):
except SystemExit:
pass
assert not os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
assert not os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
assert not os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))
assert not (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists()
assert not (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists()
assert not (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists()
try:
load_main_index(out_dir=OUTPUT_DIR)
assert False, 'load_main_index should raise an exception when no index is present'
@@ -159,7 +160,7 @@ class TestAdd(unittest.TestCase):
assert len(all_links) == 30
def test_add_arg_file(self):
test_file = os.path.join(OUTPUT_DIR, 'test.txt')
test_file = Path(OUTPUT_DIR) / 'test.txt'
with open(test_file, 'w+') as f:
f.write(test_urls)

View File

@@ -431,7 +431,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
with open(f'{config_path}.bak', 'r') as old:
atomic_write(config_path, old.read())
if os.path.exists(f'{config_path}.bak'):
if Path(f'{config_path}.bak').exists():
os.remove(f'{config_path}.bak')
return {}
@@ -540,7 +540,7 @@ def bin_path(binary: Optional[str]) -> Optional[str]:
if node_modules_bin.exists():
return str(node_modules_bin.resolve())
return shutil.which(os.path.expanduser(binary)) or binary
return shutil.which(Path(binary).expanduser()) or binary
def bin_hash(binary: Optional[str]) -> Optional[str]:
if binary is None:
@@ -634,17 +634,17 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
}
def get_external_locations(config: ConfigDict) -> ConfigValue:
    """Describe the optional external paths referenced by the config.

    Returns a dict keyed by option name ('CHROME_USER_DATA_DIR' and
    'COOKIES_FILE'). Each entry contains:

    - 'path': the resolved absolute ``Path``, or None when the option is unset
    - 'enabled': the result of ``<USE_* flag> and <configured value>``
    - 'is_valid': whether the expected location exists on disk (for the
      Chrome user data dir this means it contains a 'Default' entry)
    """
    def abspath(path):
        # Unset options stay None instead of resolving to the current directory.
        return None if path is None else Path(path).resolve()

    chrome_user_data_dir = config['CHROME_USER_DATA_DIR']
    cookies_file = config['COOKIES_FILE']

    return {
        'CHROME_USER_DATA_DIR': {
            'path': abspath(chrome_user_data_dir),
            'enabled': config['USE_CHROME'] and chrome_user_data_dir,
            'is_valid': False if chrome_user_data_dir is None else (Path(chrome_user_data_dir) / 'Default').exists(),
        },
        'COOKIES_FILE': {
            'path': abspath(cookies_file),
            'enabled': config['USE_WGET'] and cookies_file,
            'is_valid': False if cookies_file is None else Path(cookies_file).exists(),
        },
    }
@@ -828,7 +828,7 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
# stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
# stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
if config['CHROME_USER_DATA_DIR'] is not None:
if not os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')):
if not (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists():
stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
stderr(f' {config["CHROME_USER_DATA_DIR"]}')
stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')

View File

@@ -2,6 +2,7 @@ __package__ = 'archivebox.core'
import os
import sys
from pathlib import Path
from django.utils.crypto import get_random_string
@@ -49,9 +50,9 @@ TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [
os.path.join(PYTHON_DIR, 'themes', ACTIVE_THEME),
os.path.join(PYTHON_DIR, 'themes', 'default'),
os.path.join(PYTHON_DIR, 'themes'),
str(Path(PYTHON_DIR) / 'themes' / ACTIVE_THEME),
str(Path(PYTHON_DIR) / 'themes' / 'default'),
str(Path(PYTHON_DIR) / 'themes'),
],
'APP_DIRS': True,
'OPTIONS': {
@@ -70,7 +71,7 @@ WSGI_APPLICATION = 'core.wsgi.application'
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME),
'NAME': str(Path(OUTPUT_DIR) / SQL_INDEX_FILENAME),
}
}
@@ -105,7 +106,7 @@ SHELL_PLUS_PRINT_SQL = False
IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner']
IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell'
if IS_SHELL:
os.environ['PYTHONSTARTUP'] = os.path.join(PYTHON_DIR, 'core', 'welcome_message.py')
os.environ['PYTHONSTARTUP'] = str(Path(PYTHON_DIR) / 'core' / 'welcome_message.py')
LANGUAGE_CODE = 'en-us'
@@ -122,6 +123,6 @@ EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
STATIC_URL = '/static/'
STATICFILES_DIRS = [
os.path.join(PYTHON_DIR, 'themes', ACTIVE_THEME, 'static'),
os.path.join(PYTHON_DIR, 'themes', 'default', 'static'),
str(Path(PYTHON_DIR) / 'themes' / ACTIVE_THEME / 'static'),
str(Path(PYTHON_DIR) / 'themes' / 'default' / 'static'),
]

View File

@@ -14,11 +14,11 @@ def get_icons(snapshot: Snapshot) -> str:
return format_html(
'<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
'<a href="/{}/{}/" class="exists-{}" title="Wget clone">🌐 </a> '
'<a href="/{}/{}" class="exists-{}" title="Wget clone">🌐 </a> '
'<a href="/{}/{}" class="exists-{}" title="PDF">📄</a> '
'<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
'<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
'<a href="/{}/{}/" class="exists-{}" title="WARC">🆆 </a> '
'<a href="/{}/{}" class="exists-{}" title="WARC">🆆 </a> '
'<a href="/{}/{}" class="exists-{}" title="SingleFile">&#128476; </a>'
'<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
'<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '

View File

@@ -114,12 +114,23 @@ class AddView(UserPassesTestMixin, FormView):
template_name = "add_links.html"
form_class = AddLinkForm
def get_initial(self):
    """Prefill the AddLinkForm with the 'url' GET parameter.

    Returns ``{'url': <value>}`` when the request is a GET carrying a
    non-empty ``url`` query parameter; in every other case the parent
    class's initial data is returned.
    """
    if self.request.method == 'GET':
        url = self.request.GET.get('url', None)
        if url:
            return {'url': url}
    # Fall back to the parent for every remaining case (including a GET
    # without ?url=) so this method never implicitly returns None.
    return super().get_initial()
def test_func(self):
"""Return a truthy value when PUBLIC_ADD_VIEW is set or the requesting user is authenticated."""
return PUBLIC_ADD_VIEW or self.request.user.is_authenticated
def get_context_data(self, *args, **kwargs):
"""Extend the parent's template context with a "title" and an "absolute_add_path" entry."""
context = super().get_context_data(*args, **kwargs)
context["title"] = "Add URLs"
# We can't just call request.build_absolute_uri in the template, because it would include query parameters
context["absolute_add_path"] = self.request.build_absolute_uri(self.request.path)
return context
def form_valid(self, form):

View File

@@ -75,7 +75,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
out_dir = out_dir or Path(link.link_dir)
try:
is_new = not os.path.exists(out_dir)
is_new = not Path(out_dir).exists()
if is_new:
os.makedirs(out_dir)

View File

@@ -1,6 +1,5 @@
__package__ = 'archivebox.extractors'
import os
from pathlib import Path
from typing import Optional
@@ -22,7 +21,7 @@ from ..logging_util import TimedProgress
@enforce_types
def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
    """Decide whether a favicon still needs to be saved for ``link``.

    Returns False when 'favicon.ico' already exists in ``out_dir``
    (defaulting to ``link.link_dir``); otherwise returns SAVE_FAVICON.
    """
    out_dir = out_dir or link.link_dir
    if (Path(out_dir) / 'favicon.ico').exists():
        return False

    return SAVE_FAVICON

View File

@@ -179,7 +179,7 @@ def wget_output_path(link: Link) -> Optional[str]:
if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
]
if html_files:
return str(html_files[0])
return str(html_files[0].relative_to(link.link_dir))
# Move up one directory level
search_dir = search_dir.parent

View File

@@ -575,7 +575,7 @@ def is_archived(link: Link) -> bool:
return is_valid(link) and link.is_archived
def is_unarchived(link: Link) -> bool:
    """Return True when ``link`` has no data folder on disk or is not archived."""
    # A link whose folder is missing counts as unarchived regardless of
    # what link.is_archived would report.
    if not Path(link.link_dir).exists():
        return True
    return not link.is_archived

View File

@@ -1,7 +1,5 @@
__package__ = 'archivebox.index'
import os
from string import Template
from datetime import datetime
from typing import List, Optional, Iterator, Mapping
@@ -30,11 +28,10 @@ from ..config import (
FAVICON_FILENAME,
)
join = lambda *paths: os.path.join(*paths)
MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html')
MINIMAL_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index_minimal.html')
MAIN_INDEX_ROW_TEMPLATE = join(TEMPLATES_DIR, 'main_index_row.html')
LINK_DETAILS_TEMPLATE = join(TEMPLATES_DIR, 'link_details.html')
MAIN_INDEX_TEMPLATE = str(Path(TEMPLATES_DIR) / 'main_index.html')
MINIMAL_INDEX_TEMPLATE = str(Path(TEMPLATES_DIR) / 'main_index_minimal.html')
MAIN_INDEX_ROW_TEMPLATE = str(Path(TEMPLATES_DIR) / 'main_index_row.html')
LINK_DETAILS_TEMPLATE = str(Path(TEMPLATES_DIR) / 'link_details.html')
TITLE_LOADING_MSG = 'Not yet archived...'
@@ -44,8 +41,8 @@ TITLE_LOADING_MSG = 'Not yet archived...'
def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
"""parse an archive index html file and return the list of urls"""
index_path = join(out_dir, HTML_INDEX_FILENAME)
if os.path.exists(index_path):
index_path = Path(out_dir) / HTML_INDEX_FILENAME
if index_path.exists():
with open(index_path, 'r', encoding='utf-8') as f:
for line in f:
if 'class="link-url"' in line:
@@ -56,12 +53,12 @@ def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
def write_html_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool=False) -> None:
    """write the html link index to a given path

    Copies the favicon, robots.txt and static dir from TEMPLATES_DIR into
    ``out_dir``, then renders the main index template for ``links`` and
    writes it to HTML_INDEX_FILENAME inside ``out_dir``.
    """
    # Coerce once so a plain string out_dir works the same as a Path,
    # matching how parse_html_main_index builds its paths.
    out_path = Path(out_dir)
    templates_path = Path(TEMPLATES_DIR)

    for name in (FAVICON_FILENAME, ROBOTS_TXT_FILENAME, STATIC_DIR_NAME):
        copy_and_overwrite(str(templates_path / name), str(out_path / name))

    rendered_html = main_index_template(links, finished=finished)
    atomic_write(str(out_path / HTML_INDEX_FILENAME), rendered_html)
@enforce_types
@@ -100,7 +97,7 @@ def main_index_row_template(link: Link) -> str:
# before pages are finished archiving, show fallback loading favicon
'favicon_url': (
join(ARCHIVE_DIR_NAME, link.timestamp, 'favicon.ico')
str(Path(ARCHIVE_DIR_NAME) / link.timestamp / 'favicon.ico')
# if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
),
@@ -119,7 +116,7 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or link.link_dir
rendered_html = link_details_template(link)
atomic_write(join(out_dir, HTML_INDEX_FILENAME), rendered_html)
atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html)
@enforce_types

View File

@@ -45,8 +45,8 @@ MAIN_INDEX_HEADER = {
def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
"""parse an archive index json file and return the list of links"""
index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(index_path):
index_path = Path(out_dir) / JSON_INDEX_FILENAME
if index_path.exists():
with open(index_path, 'r', encoding='utf-8') as f:
links = pyjson.load(f)['links']
for link_json in links:
@@ -86,7 +86,7 @@ def write_json_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
'last_run_cmd': sys.argv,
'links': links,
}
atomic_write(os.path.join(out_dir, JSON_INDEX_FILENAME), main_index_json)
atomic_write(str(Path(out_dir) / JSON_INDEX_FILENAME), main_index_json)
### Link Details Index
@@ -96,15 +96,15 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
"""write a json file with some info about the link"""
out_dir = out_dir or link.link_dir
path = os.path.join(out_dir, JSON_INDEX_FILENAME)
atomic_write(path, link._asdict(extended=True))
path = Path(out_dir) / JSON_INDEX_FILENAME
atomic_write(str(path), link._asdict(extended=True))
@enforce_types
def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]:
"""load the json link index from a given directory"""
existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(existing_index):
existing_index = Path(out_dir) / JSON_INDEX_FILENAME
if existing_index.exists():
with open(existing_index, 'r', encoding='utf-8') as f:
try:
link_json = pyjson.load(f)
@@ -118,9 +118,9 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal
def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
"""read through all the archive data folders and return the parsed links"""
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
if entry.is_dir(follow_symlinks=True):
if os.path.exists(os.path.join(entry.path, 'index.json')):
if (Path(entry.path) / 'index.json').exists():
try:
link = parse_json_link_details(entry.path)
except KeyError:

View File

@@ -1,6 +1,5 @@
__package__ = 'archivebox.index'
import os
from pathlib import Path
from datetime import datetime, timedelta
@@ -250,7 +249,7 @@ class Link:
@property
def link_dir(self) -> str:
    """Path of this link's data folder: CONFIG['ARCHIVE_DIR'] joined with the link timestamp, as a str."""
    from ..config import CONFIG
    return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp)
@property
def archive_path(self) -> str:
@@ -369,7 +368,7 @@ class Link:
)
return any(
os.path.exists(os.path.join(ARCHIVE_DIR, self.timestamp, path))
(Path(ARCHIVE_DIR) / self.timestamp / path).exists()
for path in output_paths
)

View File

@@ -390,7 +390,7 @@ def log_list_finished(links):
def log_removal_started(links: List["Link"], yes: bool, delete: bool):
print('{lightyellow}[i] Found {} matching URLs to remove.{reset}'.format(len(links), **ANSI))
if delete:
file_counts = [link.num_outputs for link in links if os.path.exists(link.link_dir)]
file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()]
print(
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
@@ -445,9 +445,9 @@ def log_shell_welcome_msg():
@enforce_types
def pretty_path(path: Union[Path, str]) -> str:
    """convert paths like <cwd>/output/abc into ./output/abc

    Every occurrence of the resolved current working directory followed by
    '/' in ``str(path)`` is replaced with './'. Paths that do not contain
    it are returned unchanged (as a str).
    """
    pwd = Path('.').resolve()
    # parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
    # pwd is a Path, so it must be stringified before concatenating '/'.
    return str(path).replace(str(pwd) + '/', './')
@enforce_types
@@ -518,11 +518,11 @@ def printable_folder_status(name: str, folder: Dict) -> str:
color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
if folder['path']:
if os.path.exists(folder['path']):
if Path(folder['path']).exists():
num_files = (
f'{len(os.listdir(folder["path"]))} files'
if os.path.isdir(folder['path']) else
printable_filesize(os.path.getsize(folder['path']))
if Path(folder['path']).is_dir() else
printable_filesize(Path(folder['path']).stat().st_size)
)
else:
num_files = 'missing'

View File

@@ -8,7 +8,6 @@ For examples of supported import formats see tests/.
__package__ = 'archivebox.parsers'
import re
import os
from io import StringIO
from typing import IO, Tuple, List, Optional
@@ -128,7 +127,7 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None)
@enforce_types
def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=OUTPUT_DIR) -> str:
    """Write ``raw_text`` to a timestamped file in the sources dir and return its path.

    ``filename`` may contain a ``{ts}`` placeholder, which is filled with the
    current UNIX timestamp truncated to whole seconds. The file is written
    under ``out_dir / SOURCES_DIR_NAME`` and the resulting path is returned
    as a str.
    """
    ts = str(datetime.now().timestamp()).split('.', 1)[0]
    # Path(out_dir) keeps this working whether out_dir is a Path or a str.
    source_path = str(Path(out_dir) / SOURCES_DIR_NAME / filename.format(ts=ts))
    atomic_write(source_path, raw_text)
    log_source_saved(source_file=source_path)
    return source_path
@@ -138,7 +137,7 @@ def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir:
def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: Path=OUTPUT_DIR) -> str:
"""download a given url's content into output/sources/domain-<timestamp>.txt"""
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(basename=basename(path), ts=ts))
source_path = str(OUTPUT_DIR / SOURCES_DIR_NAME / filename.format(basename=basename(path), ts=ts))
if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
# Source is a URL that needs to be downloaded

View File

@@ -64,7 +64,7 @@ def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS) ->
@enforce_types
def copy_and_overwrite(from_path: str, to_path: str):
"""copy a given file or directory to a given path, overwriting the destination"""
if os.path.isdir(from_path):
if Path(from_path).is_dir():
shutil.rmtree(to_path, ignore_errors=True)
shutil.copytree(from_path, to_path)
else:

View File

@@ -49,6 +49,12 @@
<small>(it's safe to leave this page, adding will continue in the background)</small>
</div>
</center>
{% if absolute_add_path %}
{# The current page URL is percent-encoded so characters like &, # and ? survive as a single "url" query parameter. #}
<center id="bookmarklet">
<p>Bookmark this link to quickly add to your archive:
<a href="javascript:void(window.open('{{ absolute_add_path }}?url='+encodeURIComponent(document.location.href)));">Add to ArchiveBox</a></p>
</center>
{% endif %}
<script>
document.getElementById('add-form').addEventListener('submit', function(event) {
setTimeout(function() {

View File

@@ -2,7 +2,7 @@
<td title="$timestamp">$bookmarked_date</td>
<td class="title-col">
<a href="$archive_path/index.html" class="link-url"><img src="$favicon_url" class="link-favicon" decoding="async"></a>
<a href="$wget_url" title="$title">
<a href="$archive_path/$wget_url" title="$title">
<span data-title-for="$url" data-archived="$is_archived">$title</span>
<small style="float:right">$tags</small>
</a>