Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2026-04-05 15:27:53 +10:00
first attempt to migrate to Pathlib

commit 594d9e49ce (parent 2767155e59), committed by Cristian Vargas
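
The change is almost entirely mechanical: across the four index modules, out_dir parameters annotated as str become Path (or Union[Path, str] where callers still pass raw strings), from pathlib import Path is added where missing, and the function bodies keep their os.path calls for now. A minimal before/after sketch of the pattern, using hypothetical stand-ins for ArchiveBox's OUTPUT_DIR and JSON_INDEX_FILENAME config values:

import os
from pathlib import Path
from typing import Optional

# hypothetical stand-ins for the real values in archivebox.config
OUTPUT_DIR = Path('output')
JSON_INDEX_FILENAME = 'index.json'

# before: def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[str]:
    # body unchanged: os.path.join() and os.path.exists() accept Path
    # objects via os.fspath() (PEP 519) and return plain str paths
    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    if os.path.exists(index_path):
        with open(index_path, 'r', encoding='utf-8') as f:
            return f.read()  # simplified; the real function parses JSON here
    return None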
archivebox/index/__init__.py

@@ -6,6 +6,7 @@ import json as pyjson
 
+from pathlib import Path
 from itertools import chain
 from typing import List, Tuple, Dict, Optional, Iterable
 from collections import OrderedDict
 from contextlib import contextmanager
@@ -224,7 +225,7 @@ def timed_index_update(out_path: str):
 
 
 @enforce_types
-def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
+def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool=False) -> None:
     """create index.html file for a given list of links"""
 
     log_indexing_process_started(len(links))
@@ -260,7 +261,7 @@ def get_empty_snapshot_queryset(out_dir: str=OUTPUT_DIR):
     return Snapshot.objects.none()
 
 @enforce_types
-def load_main_index(out_dir: str=OUTPUT_DIR, warn: bool=True) -> List[Link]:
+def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
     """parse and load existing index with any new links from import_path merged in"""
     setup_django(out_dir, check_db=True)
     from core.models import Snapshot
@@ -271,7 +272,7 @@ def load_main_index(out_dir: str=OUTPUT_DIR, warn: bool=True) -> List[Link]:
         raise SystemExit(0)
 
 @enforce_types
-def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
+def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]:
     index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
@@ -392,7 +393,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
     return snapshots.filter(q_filter)
 
 
-def get_indexed_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_indexed_folders(links, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
     links = [snapshot.as_link() for snapshot in snapshots.iterator()]
     return {
@@ -400,7 +401,7 @@ def get_indexed_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optiona
         for link in links
     }
 
-def get_archived_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_archived_folders(links, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
     links = [snapshot.as_link() for snapshot in snapshots.iterator()]
     return {
@@ -408,7 +409,7 @@ def get_archived_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Option
         for link in filter(is_archived, links)
     }
 
-def get_unarchived_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unarchived_folders(links, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
     links = [snapshot.as_link() for snapshot in snapshots.iterator()]
     return {
@@ -416,7 +417,7 @@ def get_unarchived_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Opti
         for link in filter(is_unarchived, links)
     }
 
-def get_present_folders(_snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_present_folders(links, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that actually exist in the archive/ folder"""
 
     all_folders = {}
@@ -433,7 +434,7 @@ def get_present_folders(_snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Option
 
     return all_folders
 
-def get_valid_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_valid_folders(links, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
     links = [snapshot.as_link() for snapshot in snapshots.iterator()]
     return {
@@ -441,7 +442,7 @@ def get_valid_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[
         for link in filter(is_valid, links)
     }
 
-def get_invalid_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_invalid_folders(links, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
     duplicate = get_duplicate_folders(snapshots, out_dir=OUTPUT_DIR)
     orphaned = get_orphaned_folders(snapshots, out_dir=OUTPUT_DIR)
@@ -450,7 +451,7 @@ def get_invalid_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optiona
     return {**duplicate, **orphaned, **corrupted, **unrecognized}
 
 
-def get_duplicate_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_duplicate_folders(links, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that conflict with other directories that have the same link URL or timestamp"""
     by_url = {}
     by_timestamp = {}
@@ -484,7 +485,7 @@ def get_duplicate_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optio
             duplicate_folders[path] = link
     return duplicate_folders
 
-def get_orphaned_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_orphaned_folders(links, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that contain a valid index but aren't listed in the main index"""
     orphaned_folders = {}
 
@@ -502,7 +503,7 @@ def get_orphaned_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Option
 
     return orphaned_folders
 
-def get_corrupted_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
     corrupted = {}
     for snapshot in snapshots.iterator():
@@ -511,7 +512,7 @@ def get_corrupted_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optio
             corrupted[link.link_dir] = link
     return corrupted
 
-def get_unrecognized_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain recognizable archive data and aren't listed in the main index"""
     unrecognized_folders: Dict[str, Optional[Link]] = {}
 
@@ -580,7 +581,7 @@ def is_unarchived(link: Link) -> bool:
     return not link.is_archived
 
 
-def fix_invalid_folder_locations(out_dir: str=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
+def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
     fixed = []
     cant_fix = []
     for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
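
None of the bodies above break under a Path-typed out_dir, because the os helpers they use (os.path.join, os.path.exists, os.scandir) accept any os.PathLike object since PEP 519 landed in Python 3.6. A quick illustration, with a hypothetical archive root:

import os
from pathlib import Path

out_dir = Path('output')        # hypothetical archive root
ARCHIVE_DIR_NAME = 'archive'    # same constant name as used above

# equivalent at runtime; the second form is the idiomatic pathlib
# spelling this migration is working toward
legacy = os.path.join(out_dir, ARCHIVE_DIR_NAME)
modern = out_dir / ARCHIVE_DIR_NAME
assert str(modern) == legacy

# os.scandir() also takes a Path directly
if modern.is_dir():
    for entry in os.scandir(modern):
        print(entry.path)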
archivebox/index/html.py

@@ -5,6 +5,7 @@ import os
 from string import Template
 from datetime import datetime
 from typing import List, Optional, Iterator, Mapping
+from pathlib import Path
 
 from .schema import Link
 from ..system import atomic_write, copy_and_overwrite
@@ -40,7 +41,7 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 ### Main Links Index
 
 @enforce_types
-def parse_html_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[str]:
+def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
     """parse an archive index html file and return the list of urls"""
 
     index_path = join(out_dir, HTML_INDEX_FILENAME)
@@ -52,7 +53,7 @@ def parse_html_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[str]:
     return ()
 
 @enforce_types
-def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
+def write_html_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool=False) -> None:
     """write the html link index to a given path"""
 
     copy_and_overwrite(join(TEMPLATES_DIR, FAVICON_FILENAME), join(out_dir, FAVICON_FILENAME))
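
html.py still builds its paths with join() (os.path.join), which keeps working under the new Path-typed out_dir since join() flattens Path arguments to str. A sketch of the pathlib-native equivalent it could eventually move to, with TEMPLATES_DIR and FAVICON_FILENAME as hypothetical stand-ins and ArchiveBox's copy_and_overwrite helper roughly approximated with shutil:

import shutil
from pathlib import Path

# hypothetical stand-ins for the config values referenced above
TEMPLATES_DIR = Path('archivebox/themes/default')
FAVICON_FILENAME = 'favicon.ico'

def copy_and_overwrite(src: Path, dst: Path) -> None:
    # rough single-file approximation of ArchiveBox's helper, for this sketch only
    shutil.copy2(src, dst)

def write_favicon(out_dir: Path) -> None:
    # join(TEMPLATES_DIR, FAVICON_FILENAME) becomes TEMPLATES_DIR / FAVICON_FILENAME
    copy_and_overwrite(TEMPLATES_DIR / FAVICON_FILENAME, out_dir / FAVICON_FILENAME)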
archivebox/index/json.py

@@ -6,7 +6,7 @@ import json as pyjson
 from pathlib import Path
 
 from datetime import datetime
-from typing import List, Optional, Iterator, Any
+from typing import List, Optional, Iterator, Any, Union
 
 from .schema import Link, ArchiveResult
 from ..system import atomic_write
@@ -42,7 +42,7 @@ MAIN_INDEX_HEADER = {
 ### Main Links Index
 
 @enforce_types
-def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
+def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
     """parse an archive index json file and return the list of links"""
 
     index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
@@ -66,7 +66,7 @@ def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
     return ()
 
 @enforce_types
-def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
+def write_json_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
     """write the json link index to a given path"""
 
     assert isinstance(links, List), 'Links must be a list, not a generator.'
@@ -101,7 +101,7 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
 
 
 @enforce_types
-def parse_json_link_details(out_dir: str, guess: Optional[bool]=False) -> Optional[Link]:
+def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]:
     """load the json link index from a given directory"""
     existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
     if os.path.exists(existing_index):
@@ -115,7 +115,7 @@ def parse_json_link_details(out_dir: str, guess: Optional[bool]=False) -> Option
 
 
 @enforce_types
-def parse_json_links_details(out_dir: str) -> Iterator[Link]:
+def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
     """read through all the archive data folders and return the parsed links"""
 
     for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
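
Note that parse_json_link_details and parse_json_links_details are widened to Union[Path, str] rather than Path, since callers also hand them raw entry.path strings from os.scandir(). A common pattern at such boundaries is to normalize to Path once on entry; a sketch (hypothetical helper, not the commit's code):

from pathlib import Path
from typing import Optional, Union

JSON_INDEX_FILENAME = 'index.json'  # same constant name as used above

def read_link_details(out_dir: Union[Path, str]) -> Optional[str]:
    # normalize str|Path to Path at the boundary, then use pathlib
    # methods instead of os.path.* in the body
    out_dir = Path(out_dir)
    existing_index = out_dir / JSON_INDEX_FILENAME
    if existing_index.exists():
        return existing_index.read_text(encoding='utf-8')
    return None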
archivebox/index/sql.py

@@ -1,6 +1,7 @@
 __package__ = 'archivebox.index'
 
 from io import StringIO
+from pathlib import Path
 from typing import List, Tuple, Iterator
 from django.db.models import QuerySet
 
@@ -12,7 +13,7 @@ from ..config import setup_django, OUTPUT_DIR
 ### Main Links Index
 
 @enforce_types
-def parse_sql_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
+def parse_sql_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
     setup_django(out_dir, check_db=True)
     from core.models import Snapshot
 
@@ -22,7 +23,7 @@ def parse_sql_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
     )
 
 @enforce_types
-def remove_from_sql_main_index(snapshots: QuerySet, out_dir: str=OUTPUT_DIR) -> None:
+def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) -> None:
     setup_django(out_dir, check_db=True)
     from django.db import transaction
 
@@ -43,7 +44,7 @@ def write_link_to_sql_index(link: Link):
 
 
 @enforce_types
-def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
+def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
     setup_django(out_dir, check_db=True)
     from django.db import transaction
 
@@ -53,7 +54,7 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
 
 
 @enforce_types
-def write_sql_link_details(link: Link, out_dir: str=OUTPUT_DIR) -> None:
+def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
     setup_django(out_dir, check_db=True)
     from core.models import Snapshot
     from django.db import transaction
@@ -70,7 +71,7 @@ def write_sql_link_details(link: Link, out_dir: str=OUTPUT_DIR) -> None:
 
 
 @enforce_types
-def list_migrations(out_dir: str=OUTPUT_DIR) -> List[Tuple[bool, str]]:
+def list_migrations(out_dir: Path=OUTPUT_DIR) -> List[Tuple[bool, str]]:
     setup_django(out_dir, check_db=False)
     from django.core.management import call_command
     out = StringIO()
@@ -87,7 +88,7 @@ def list_migrations(out_dir: str=OUTPUT_DIR) -> List[Tuple[bool, str]]:
     return migrations
 
 @enforce_types
-def apply_migrations(out_dir: str=OUTPUT_DIR) -> List[str]:
+def apply_migrations(out_dir: Path=OUTPUT_DIR) -> List[str]:
     setup_django(out_dir, check_db=False)
     from django.core.management import call_command
     null, out = StringIO(), StringIO()
@@ -98,7 +99,7 @@ def apply_migrations(out_dir: str=OUTPUT_DIR) -> List[str]:
     return [line.strip() for line in out.readlines() if line.strip()]
 
 @enforce_types
-def get_admins(out_dir: str=OUTPUT_DIR) -> List[str]:
+def get_admins(out_dir: Path=OUTPUT_DIR) -> List[str]:
     setup_django(out_dir, check_db=False)
     from django.contrib.auth.models import User
     return User.objects.filter(is_superuser=True)
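
The annotations matter at runtime here, not just for documentation: each of these functions is wrapped in ArchiveBox's @enforce_types decorator, which checks call arguments against the annotations. A toy stand-in for such a decorator (an illustrative sketch, not ArchiveBox's implementation) shows what the str to Path change implies for callers:

import functools
import typing
from pathlib import Path

def enforce_types(func):
    # toy runtime type-checker: compares each argument against the
    # function's annotations and raises TypeError on a mismatch
    hints = typing.get_type_hints(func)

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        names = func.__code__.co_varnames[:func.__code__.co_argcount]
        bound = {**dict(zip(names, args)), **kwargs}
        for name, value in bound.items():
            expected = hints.get(name)
            if isinstance(expected, type) and not isinstance(value, expected):
                raise TypeError(f'{name} must be {expected.__name__}, '
                                f'got {type(value).__name__}')
        return func(*args, **kwargs)
    return wrapper

@enforce_types
def get_admins(out_dir: Path = Path('.')) -> list:
    return []  # stub body; the real function queries Django's User model

get_admins(Path('data'))   # accepted: Path matches the annotation
# get_admins('data')       # would raise TypeError under strict enforcement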