Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2026-04-03 06:17:53 +10:00)
fix: Improve headers handling
This commit is contained in:
Committed by: Cristian Vargas
Parent: a40af98ced
Commit: 62ed11a5ca
@@ -85,6 +85,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
|
||||
'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
|
||||
'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
|
||||
'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
|
||||
+ 'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)},
|
||||
'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
|
||||
'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
|
||||
'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
|
||||
|
||||
@@ -16,6 +16,7 @@ from ..config import (
|
||||
CURL_USER_AGENT,
|
||||
CURL_VERSION,
|
||||
CHECK_SSL_VALIDITY,
|
||||
+ SAVE_HEADERS
|
||||
)
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
@@ -24,7 +25,7 @@ def should_save_headers(link: Link, out_dir: Optional[str]=None) -> bool:
|
||||
out_dir = out_dir or link.link_dir
|
||||
|
||||
output = Path(out_dir or link.link_dir) / 'headers.json'
|
||||
- return not output.exists()
|
||||
+ return not output.exists() and SAVE_HEADERS
|
||||
|
||||
|
||||
@enforce_types
|
||||
|
||||
@@ -186,13 +186,17 @@ def get_headers(url: str, timeout: int=None) -> str:
|
||||
headers={'User-Agent': WGET_USER_AGENT},
|
||||
verify=CHECK_SSL_VALIDITY,
|
||||
timeout=timeout,
|
||||
allow_redirects=True
|
||||
)
|
||||
if response.status_code >= 400:
|
||||
raise RequestException
|
||||
except RequestException:
|
||||
response = requests.get(
|
||||
url,
|
||||
headers={'User-Agent': WGET_USER_AGENT},
|
||||
verify=CHECK_SSL_VALIDITY,
|
||||
timeout=timeout,
|
||||
stream=True
|
||||
)
|
||||
|
||||
return pyjson.dumps(dict(response.headers), indent=4)
|
||||
|
||||
Reference in New Issue
Block a user