move final legacy config to plugins and fix archivebox config cmd and add search opt

This commit is contained in:
Nick Sweeting
2024-10-21 02:56:00 -07:00
parent 115f89fd8b
commit b3107ab830
20 changed files with 379 additions and 275 deletions

View File

@@ -10,10 +10,6 @@ from datetime import datetime, timezone
from django.db.models import QuerySet
from archivebox.config.legacy import (
SAVE_ALLOWLIST_PTN,
SAVE_DENYLIST_PTN,
)
from ..index.schema import ArchiveResult, Link
from ..index.sql import write_link_to_sql_index
from ..index import (
@@ -82,27 +78,30 @@ ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
@enforce_types
def get_archive_methods_for_link(link: Link) -> Iterable[ArchiveMethodEntry]:
    """Return the default archive methods that are permitted for ``link.url``.

    The allow/deny configuration is read from
    ``ARCHIVING_CONFIG.SAVE_ALLOWLIST_PTNS`` and
    ``ARCHIVING_CONFIG.SAVE_DENYLIST_PTNS``. Both are iterated with
    ``.items()`` as ``url_pattern -> methods`` mappings, where each
    ``url_pattern`` exposes ``.search()`` (presumably a compiled regex --
    confirm against the config definition).

    Selection rules:
      * allowed = every method name listed under an allowlist pattern that
        matches the URL; if that set is empty, every default method name.
      * denied  = every method name listed under a denylist pattern that
        matches the URL.
      * result  = default methods whose name (``method[0]``) is in
        ``allowed - denied``, in the order given by
        ``get_default_archive_methods()``.

    Returns:
        A list of entries from ``get_default_archive_methods()``.
    """
    # Function-scope import, kept exactly where the original block had it.
    from archivebox.config.common import ARCHIVING_CONFIG

    DEFAULT_METHODS = get_default_archive_methods()
    url = link.url

    # The pattern test is placed before the inner loop so each pattern is
    # searched once per URL instead of once per method name it maps to.
    allowed_methods = {
        method_name
        for url_pattern, methods in ARCHIVING_CONFIG.SAVE_ALLOWLIST_PTNS.items()
        if url_pattern.search(url)
        for method_name in methods
    } or {method[0] for method in DEFAULT_METHODS}

    denied_methods = {
        method_name
        for url_pattern, methods in ARCHIVING_CONFIG.SAVE_DENYLIST_PTNS.items()
        if url_pattern.search(url)
        for method_name in methods
    }

    # Deny rules take precedence over allow rules.
    allowed_methods -= denied_methods

    return [method for method in DEFAULT_METHODS if method[0] in allowed_methods]
@enforce_types
def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
    """Return the default archive method names that are not in ``to_ignore``.

    Args:
        to_ignore: Method names to exclude from the result.

    Returns:
        A list of the first element (``method[0]``) of each entry from
        ``get_default_archive_methods()``, in that order, skipping any
        value that appears in ``to_ignore``.
    """
    ARCHIVE_METHODS = get_default_archive_methods()
    return [method[0] for method in ARCHIVE_METHODS if method[0] not in to_ignore]
@enforce_types
def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> Link:

View File

@@ -7,10 +7,11 @@ from typing import Optional
from archivebox.config import VERSION
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.config.legacy import SAVE_HTMLTOTEXT
from archivebox.misc.system import atomic_write
from archivebox.misc.util import enforce_types, is_static_file
from archivebox.plugins_extractor.htmltotext.config import HTMLTOTEXT_CONFIG
from ..logging_util import TimedProgress
from ..index.schema import Link, ArchiveResult, ArchiveError
from .title import get_html
@@ -114,7 +115,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:
if not overwrite and (out_dir / get_output_path()).exists():
return False
return SAVE_HTMLTOTEXT
return HTMLTOTEXT_CONFIG.SAVE_HTMLTOTEXT
@enforce_types