Move the final legacy config into plugins, fix the `archivebox config` command, and add a search option

This commit is contained in:
Nick Sweeting
2024-10-21 02:56:00 -07:00
parent 115f89fd8b
commit b3107ab830
20 changed files with 379 additions and 275 deletions

View File

@@ -17,7 +17,6 @@ from archivebox.misc.util import scheme, enforce_types, ExtendedEncoder
from archivebox.config import DATA_DIR, CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.config.legacy import URL_DENYLIST_PTN, URL_ALLOWLIST_PTN
from ..logging_util import (
TimedProgress,
@@ -126,6 +125,7 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
@enforce_types
def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
"""remove chrome://, about:// or other schemed links that cant be archived"""
for link in links:
try:
urlparse(link.url)
@@ -133,9 +133,9 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
continue
if scheme(link.url) not in ('http', 'https', 'ftp'):
continue
if URL_DENYLIST_PTN and URL_DENYLIST_PTN.search(link.url):
if ARCHIVING_CONFIG.URL_DENYLIST_PTN and ARCHIVING_CONFIG.URL_DENYLIST_PTN.search(link.url):
continue
if URL_ALLOWLIST_PTN and (not URL_ALLOWLIST_PTN.search(link.url)):
if ARCHIVING_CONFIG.URL_ALLOWLIST_PTN and (not ARCHIVING_CONFIG.URL_ALLOWLIST_PTN.search(link.url)):
continue
yield link