Merge branch 'dev' into search_index_extract_html_text

This commit is contained in:
Nick Sweeting
2023-10-27 23:09:28 -07:00
committed by GitHub
29 changed files with 3230 additions and 1654 deletions

View File

@@ -22,8 +22,8 @@ from ..config import (
     JSON_INDEX_FILENAME,
     OUTPUT_DIR,
     TIMEOUT,
-    URL_BLACKLIST_PTN,
-    URL_WHITELIST_PTN,
+    URL_DENYLIST_PTN,
+    URL_ALLOWLIST_PTN,
     stderr,
     OUTPUT_PERMISSIONS
 )
@@ -142,9 +142,9 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
             continue
         if scheme(link.url) not in ('http', 'https', 'ftp'):
             continue
-        if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
+        if URL_DENYLIST_PTN and URL_DENYLIST_PTN.search(link.url):
             continue
-        if URL_WHITELIST_PTN and (not URL_WHITELIST_PTN.search(link.url)):
+        if URL_ALLOWLIST_PTN and (not URL_ALLOWLIST_PTN.search(link.url)):
             continue
         yield link